n8n-nodes-ocrbro 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/nodes/OcrBro/OcrBro.node.js +128 -37
- package/package.json +5 -3
|
@@ -1,4 +1,37 @@
|
|
|
1
1
|
"use strict";
|
|
2
|
+
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
|
|
3
|
+
if (k2 === undefined) k2 = k;
|
|
4
|
+
var desc = Object.getOwnPropertyDescriptor(m, k);
|
|
5
|
+
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
|
|
6
|
+
desc = { enumerable: true, get: function() { return m[k]; } };
|
|
7
|
+
}
|
|
8
|
+
Object.defineProperty(o, k2, desc);
|
|
9
|
+
}) : (function(o, m, k, k2) {
|
|
10
|
+
if (k2 === undefined) k2 = k;
|
|
11
|
+
o[k2] = m[k];
|
|
12
|
+
}));
|
|
13
|
+
var __setModuleDefault = (this && this.__setModuleDefault) || (Object.create ? (function(o, v) {
|
|
14
|
+
Object.defineProperty(o, "default", { enumerable: true, value: v });
|
|
15
|
+
}) : function(o, v) {
|
|
16
|
+
o["default"] = v;
|
|
17
|
+
});
|
|
18
|
+
var __importStar = (this && this.__importStar) || (function () {
|
|
19
|
+
var ownKeys = function(o) {
|
|
20
|
+
ownKeys = Object.getOwnPropertyNames || function (o) {
|
|
21
|
+
var ar = [];
|
|
22
|
+
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
|
|
23
|
+
return ar;
|
|
24
|
+
};
|
|
25
|
+
return ownKeys(o);
|
|
26
|
+
};
|
|
27
|
+
return function (mod) {
|
|
28
|
+
if (mod && mod.__esModule) return mod;
|
|
29
|
+
var result = {};
|
|
30
|
+
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
|
|
31
|
+
__setModuleDefault(result, mod);
|
|
32
|
+
return result;
|
|
33
|
+
};
|
|
34
|
+
})();
|
|
2
35
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
36
|
exports.OcrBro = void 0;
|
|
4
37
|
const tesseract_js_1 = require("tesseract.js");
|
|
@@ -10,27 +43,51 @@ class OcrBro {
|
|
|
10
43
|
icon: 'file:ocrbro.png',
|
|
11
44
|
group: ['transform'],
|
|
12
45
|
version: 1,
|
|
13
|
-
description: '
|
|
46
|
+
description: 'Extract text from images (OCR) or PDFs',
|
|
14
47
|
defaults: {
|
|
15
48
|
name: 'OCR Bro',
|
|
16
49
|
},
|
|
17
50
|
inputs: ['main'],
|
|
18
51
|
outputs: ['main'],
|
|
19
52
|
properties: [
|
|
53
|
+
{
|
|
54
|
+
displayName: 'Operation',
|
|
55
|
+
name: 'operation',
|
|
56
|
+
type: 'options',
|
|
57
|
+
noDataExpression: true,
|
|
58
|
+
options: [
|
|
59
|
+
{
|
|
60
|
+
name: 'OCR from Image',
|
|
61
|
+
value: 'ocrImage',
|
|
62
|
+
description: 'Extract text from images using Tesseract OCR',
|
|
63
|
+
},
|
|
64
|
+
{
|
|
65
|
+
name: 'Extract Text from PDF',
|
|
66
|
+
value: 'extractPdf',
|
|
67
|
+
description: 'Extract text from PDF documents',
|
|
68
|
+
},
|
|
69
|
+
],
|
|
70
|
+
default: 'ocrImage',
|
|
71
|
+
},
|
|
20
72
|
{
|
|
21
73
|
displayName: 'Input Binary Field',
|
|
22
74
|
name: 'binaryPropertyName',
|
|
23
75
|
type: 'string',
|
|
24
76
|
default: 'data',
|
|
25
77
|
required: true,
|
|
26
|
-
description: 'The name of the binary property containing the
|
|
78
|
+
description: 'The name of the binary property containing the file',
|
|
27
79
|
},
|
|
28
80
|
{
|
|
29
81
|
displayName: 'Language',
|
|
30
82
|
name: 'language',
|
|
31
83
|
type: 'string',
|
|
32
84
|
default: 'eng',
|
|
33
|
-
description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be
|
|
85
|
+
description: 'Tesseract language code (e.g., eng, deu, spa). Multiple languages can be separated by "+".',
|
|
86
|
+
displayOptions: {
|
|
87
|
+
show: {
|
|
88
|
+
operation: ['ocrImage'],
|
|
89
|
+
},
|
|
90
|
+
},
|
|
34
91
|
},
|
|
35
92
|
],
|
|
36
93
|
};
|
|
@@ -39,51 +96,85 @@ class OcrBro {
|
|
|
39
96
|
var _a;
|
|
40
97
|
const items = this.getInputData();
|
|
41
98
|
const returnData = [];
|
|
99
|
+
const operation = this.getNodeParameter('operation', 0);
|
|
42
100
|
const binaryPropertyName = this.getNodeParameter('binaryPropertyName', 0);
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
101
|
+
if (operation === 'ocrImage') {
|
|
102
|
+
const language = this.getNodeParameter('language', 0);
|
|
103
|
+
// @ts-ignore
|
|
104
|
+
const worker = await (0, tesseract_js_1.createWorker)();
|
|
105
|
+
// @ts-ignore
|
|
106
|
+
await worker.loadLanguage(language);
|
|
107
|
+
// @ts-ignore
|
|
108
|
+
await worker.initialize(language);
|
|
109
|
+
for (let i = 0; i < items.length; i++) {
|
|
110
|
+
try {
|
|
111
|
+
const item = items[i];
|
|
112
|
+
// @ts-ignore
|
|
113
|
+
const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
|
|
114
|
+
const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
|
|
115
|
+
if (!binaryMetadata) {
|
|
116
|
+
throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
|
|
117
|
+
}
|
|
118
|
+
const mimeType = binaryMetadata.mimeType;
|
|
119
|
+
if (!mimeType.startsWith('image/')) {
|
|
120
|
+
throw new Error(`Unsupported file type: ${mimeType}. Use "Extract Text from PDF" for PDF files.`);
|
|
121
|
+
}
|
|
122
|
+
// @ts-ignore
|
|
123
|
+
const { data } = await worker.recognize(binaryData);
|
|
124
|
+
returnData.push({
|
|
125
|
+
json: {
|
|
126
|
+
text: data.text,
|
|
127
|
+
confidence: data.confidence,
|
|
128
|
+
words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
|
|
129
|
+
},
|
|
130
|
+
binary: item.binary,
|
|
131
|
+
});
|
|
58
132
|
}
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
133
|
+
catch (error) {
|
|
134
|
+
if (this.continueOnFail()) {
|
|
135
|
+
returnData.push({ json: { error: error.message } });
|
|
136
|
+
continue;
|
|
137
|
+
}
|
|
138
|
+
throw error;
|
|
62
139
|
}
|
|
63
|
-
// @ts-ignore
|
|
64
|
-
const { data } = await worker.recognize(binaryData);
|
|
65
|
-
returnData.push({
|
|
66
|
-
json: {
|
|
67
|
-
text: data.text,
|
|
68
|
-
confidence: data.confidence,
|
|
69
|
-
words: ((_a = data.words) === null || _a === void 0 ? void 0 : _a.length) || 0,
|
|
70
|
-
},
|
|
71
|
-
binary: item.binary,
|
|
72
|
-
});
|
|
73
140
|
}
|
|
74
|
-
|
|
75
|
-
|
|
141
|
+
await worker.terminate();
|
|
142
|
+
}
|
|
143
|
+
else if (operation === 'extractPdf') {
|
|
144
|
+
// Dynamic import for ESM module
|
|
145
|
+
const { extractText, getDocumentProxy } = await Promise.resolve().then(() => __importStar(require('unpdf')));
|
|
146
|
+
for (let i = 0; i < items.length; i++) {
|
|
147
|
+
try {
|
|
148
|
+
const item = items[i];
|
|
149
|
+
// @ts-ignore
|
|
150
|
+
const binaryData = await this.helpers.getBinaryDataBuffer(i, binaryPropertyName);
|
|
151
|
+
const binaryMetadata = item.binary ? item.binary[binaryPropertyName] : undefined;
|
|
152
|
+
if (!binaryMetadata) {
|
|
153
|
+
throw new Error(`Binary property "${binaryPropertyName}" does not exist on item ${i}`);
|
|
154
|
+
}
|
|
155
|
+
const mimeType = binaryMetadata.mimeType;
|
|
156
|
+
if (mimeType !== 'application/pdf') {
|
|
157
|
+
throw new Error(`Expected PDF file but got: ${mimeType}. Use "OCR from Image" for image files.`);
|
|
158
|
+
}
|
|
159
|
+
const pdf = await getDocumentProxy(new Uint8Array(binaryData));
|
|
160
|
+
const { text, totalPages } = await extractText(pdf, { mergePages: true });
|
|
76
161
|
returnData.push({
|
|
77
162
|
json: {
|
|
78
|
-
|
|
163
|
+
text: text,
|
|
164
|
+
pages: totalPages,
|
|
79
165
|
},
|
|
166
|
+
binary: item.binary,
|
|
80
167
|
});
|
|
81
|
-
continue;
|
|
82
168
|
}
|
|
83
|
-
|
|
169
|
+
catch (error) {
|
|
170
|
+
if (this.continueOnFail()) {
|
|
171
|
+
returnData.push({ json: { error: error.message } });
|
|
172
|
+
continue;
|
|
173
|
+
}
|
|
174
|
+
throw error;
|
|
175
|
+
}
|
|
84
176
|
}
|
|
85
177
|
}
|
|
86
|
-
await worker.terminate();
|
|
87
178
|
return [returnData];
|
|
88
179
|
}
|
|
89
180
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "n8n-nodes-ocrbro",
|
|
3
|
-
"version": "0.1.0",
|
|
3
|
+
"version": "0.1.1",
|
|
4
4
|
"description": "Native n8n node for OCR using Tesseract.js",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"n8n-community-node-package"
|
|
@@ -42,6 +42,8 @@
|
|
|
42
42
|
"typescript": "^5.0.0"
|
|
43
43
|
},
|
|
44
44
|
"dependencies": {
|
|
45
|
-
"
|
|
45
|
+
"pdf2json": "^4.0.0",
|
|
46
|
+
"tesseract.js": "^4.0.0",
|
|
47
|
+
"unpdf": "^1.4.0"
|
|
46
48
|
}
|
|
47
|
-
}
|
|
49
|
+
}
|