@leolionart/n8n-nodes-pdf-extractor 1.1.0 → 1.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -2,10 +2,8 @@
|
|
|
2
2
|
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
3
|
exports.PdfExtractor = void 0;
|
|
4
4
|
const n8n_workflow_1 = require("n8n-workflow");
|
|
5
|
-
//
|
|
6
|
-
const
|
|
7
|
-
// Disable worker for Node.js environment
|
|
8
|
-
pdf_mjs_1.GlobalWorkerOptions.workerSrc = '';
|
|
5
|
+
// unpdf provides a simple API for PDF text extraction with password support
|
|
6
|
+
const unpdf_1 = require("unpdf");
|
|
9
7
|
/**
|
|
10
8
|
* Parse page range string into array of page numbers
|
|
11
9
|
* Supports: "1-5", "1,3,5", "1-3,7,9-11", or empty for all pages
|
|
@@ -154,40 +152,38 @@ class PdfExtractor {
|
|
|
154
152
|
// Validate binary data exists
|
|
155
153
|
const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
|
|
156
154
|
const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, binaryPropertyName);
|
|
157
|
-
// Convert buffer to Uint8Array
|
|
155
|
+
// Convert buffer to Uint8Array
|
|
158
156
|
const pdfData = new Uint8Array(buffer);
|
|
159
|
-
//
|
|
160
|
-
let
|
|
157
|
+
// Get document info first to know total pages
|
|
158
|
+
let pdf;
|
|
161
159
|
try {
|
|
162
|
-
|
|
163
|
-
data: pdfData,
|
|
160
|
+
pdf = await (0, unpdf_1.getDocumentProxy)(pdfData, {
|
|
164
161
|
password: password || undefined,
|
|
165
|
-
useSystemFonts: true,
|
|
166
162
|
});
|
|
167
|
-
pdfDocument = await loadingTask.promise;
|
|
168
163
|
}
|
|
169
164
|
catch (error) {
|
|
170
165
|
const errorMessage = error.message || String(error);
|
|
171
|
-
if (errorMessage.
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
166
|
+
if (errorMessage.toLowerCase().includes('password')) {
|
|
167
|
+
if (password) {
|
|
168
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
|
|
169
|
+
}
|
|
170
|
+
else {
|
|
171
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'This PDF is password-protected. Please provide the password.', { itemIndex });
|
|
172
|
+
}
|
|
176
173
|
}
|
|
177
174
|
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to load PDF: ${errorMessage}`, { itemIndex });
|
|
178
175
|
}
|
|
179
|
-
const numPages =
|
|
180
|
-
// Parse page range
|
|
176
|
+
const numPages = pdf.numPages;
|
|
181
177
|
const pagesToExtract = parsePageRange(options.pageRange || '', numPages);
|
|
182
|
-
// Extract text from
|
|
178
|
+
// Extract text from selected pages
|
|
183
179
|
const pageTexts = [];
|
|
184
180
|
for (const pageNum of pagesToExtract) {
|
|
185
181
|
try {
|
|
186
|
-
const page = await
|
|
182
|
+
const page = await pdf.getPage(pageNum);
|
|
187
183
|
const textContent = await page.getTextContent();
|
|
188
|
-
//
|
|
184
|
+
// Join text items
|
|
189
185
|
const pageText = textContent.items
|
|
190
|
-
.filter((item) => 'str' in item)
|
|
186
|
+
.filter((item) => typeof item === 'object' && item !== null && 'str' in item)
|
|
191
187
|
.map((item) => item.str)
|
|
192
188
|
.join(' ')
|
|
193
189
|
.replace(/\s+/g, ' ')
|
|
@@ -195,16 +191,14 @@ class PdfExtractor {
|
|
|
195
191
|
pageTexts.push({ page: pageNum, text: pageText });
|
|
196
192
|
}
|
|
197
193
|
catch (pageError) {
|
|
198
|
-
|
|
199
|
-
console.warn(`Failed to extract text from page ${pageNum}: ${pageError}`);
|
|
194
|
+
console.warn(`Failed to extract page ${pageNum}: ${pageError}`);
|
|
200
195
|
pageTexts.push({ page: pageNum, text: '' });
|
|
201
196
|
}
|
|
202
197
|
}
|
|
203
198
|
const outputProperty = options.outputProperty || 'text';
|
|
204
|
-
const joinPages = options.joinPages !== false;
|
|
199
|
+
const joinPages = options.joinPages !== false;
|
|
205
200
|
let outputData;
|
|
206
201
|
if (joinPages) {
|
|
207
|
-
// Join all pages with separator
|
|
208
202
|
const separator = options.pageSeparator || '\n\n--- Page {page} ---\n\n';
|
|
209
203
|
const fullText = pageTexts
|
|
210
204
|
.map((p, index) => {
|
|
@@ -225,7 +219,6 @@ class PdfExtractor {
|
|
|
225
219
|
};
|
|
226
220
|
}
|
|
227
221
|
else {
|
|
228
|
-
// Return array of pages
|
|
229
222
|
const pagesOutput = options.includePageNumbers
|
|
230
223
|
? pageTexts
|
|
231
224
|
: pageTexts.map(p => p.text);
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@leolionart/n8n-nodes-pdf-extractor",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.2.0",
|
|
4
4
|
"description": "n8n community node to extract text from password-protected PDFs - no external dependencies required",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"n8n-community-node-package",
|
|
@@ -59,6 +59,6 @@
|
|
|
59
59
|
"n8n-workflow": "*"
|
|
60
60
|
},
|
|
61
61
|
"dependencies": {
|
|
62
|
-
"
|
|
62
|
+
"unpdf": "^0.12.1"
|
|
63
63
|
}
|
|
64
64
|
}
|