@leolionart/n8n-nodes-pdf-extractor 1.1.0 → 1.2.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -2,10 +2,8 @@
2
2
  Object.defineProperty(exports, "__esModule", { value: true });
3
3
  exports.PdfExtractor = void 0;
4
4
  const n8n_workflow_1 = require("n8n-workflow");
5
- // Use legacy build for Node.js compatibility
6
- const pdf_mjs_1 = require("pdfjs-dist/legacy/build/pdf.mjs");
7
- // Disable worker for Node.js environment
8
- pdf_mjs_1.GlobalWorkerOptions.workerSrc = '';
5
+ // unpdf provides a simple API for PDF text extraction with password support
6
+ const unpdf_1 = require("unpdf");
9
7
  /**
10
8
  * Parse page range string into array of page numbers
11
9
  * Supports: "1-5", "1,3,5", "1-3,7,9-11", or empty for all pages
@@ -154,40 +152,38 @@ class PdfExtractor {
154
152
  // Validate binary data exists
155
153
  const binaryData = this.helpers.assertBinaryData(itemIndex, binaryPropertyName);
156
154
  const buffer = await this.helpers.getBinaryDataBuffer(itemIndex, binaryPropertyName);
157
- // Convert buffer to Uint8Array for pdfjs
155
+ // Convert buffer to Uint8Array
158
156
  const pdfData = new Uint8Array(buffer);
159
- // Load PDF document
160
- let pdfDocument;
157
+ // Get document info first to know total pages
158
+ let pdf;
161
159
  try {
162
- const loadingTask = (0, pdf_mjs_1.getDocument)({
163
- data: pdfData,
160
+ pdf = await (0, unpdf_1.getDocumentProxy)(pdfData, {
164
161
  password: password || undefined,
165
- useSystemFonts: true,
166
162
  });
167
- pdfDocument = await loadingTask.promise;
168
163
  }
169
164
  catch (error) {
170
165
  const errorMessage = error.message || String(error);
171
- if (errorMessage.includes('Invalid password') || errorMessage.includes('Incorrect Password')) {
172
- throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
173
- }
174
- if (errorMessage.includes('password')) {
175
- throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'This PDF is password-protected. Please provide the correct password.', { itemIndex });
166
+ if (errorMessage.toLowerCase().includes('password')) {
167
+ if (password) {
168
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Invalid password for PDF file', { itemIndex });
169
+ }
170
+ else {
171
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'This PDF is password-protected. Please provide the password.', { itemIndex });
172
+ }
176
173
  }
177
174
  throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Failed to load PDF: ${errorMessage}`, { itemIndex });
178
175
  }
179
- const numPages = pdfDocument.numPages;
180
- // Parse page range
176
+ const numPages = pdf.numPages;
181
177
  const pagesToExtract = parsePageRange(options.pageRange || '', numPages);
182
- // Extract text from each page
178
+ // Extract text from selected pages
183
179
  const pageTexts = [];
184
180
  for (const pageNum of pagesToExtract) {
185
181
  try {
186
- const page = await pdfDocument.getPage(pageNum);
182
+ const page = await pdf.getPage(pageNum);
187
183
  const textContent = await page.getTextContent();
188
- // Extract text items and join them
184
+ // Join text items
189
185
  const pageText = textContent.items
190
- .filter((item) => 'str' in item)
186
+ .filter((item) => typeof item === 'object' && item !== null && 'str' in item)
191
187
  .map((item) => item.str)
192
188
  .join(' ')
193
189
  .replace(/\s+/g, ' ')
@@ -195,16 +191,14 @@ class PdfExtractor {
195
191
  pageTexts.push({ page: pageNum, text: pageText });
196
192
  }
197
193
  catch (pageError) {
198
- // Continue with other pages if one fails
199
- console.warn(`Failed to extract text from page ${pageNum}: ${pageError}`);
194
+ console.warn(`Failed to extract page ${pageNum}: ${pageError}`);
200
195
  pageTexts.push({ page: pageNum, text: '' });
201
196
  }
202
197
  }
203
198
  const outputProperty = options.outputProperty || 'text';
204
- const joinPages = options.joinPages !== false; // Default to true
199
+ const joinPages = options.joinPages !== false;
205
200
  let outputData;
206
201
  if (joinPages) {
207
- // Join all pages with separator
208
202
  const separator = options.pageSeparator || '\n\n--- Page {page} ---\n\n';
209
203
  const fullText = pageTexts
210
204
  .map((p, index) => {
@@ -225,7 +219,6 @@ class PdfExtractor {
225
219
  };
226
220
  }
227
221
  else {
228
- // Return array of pages
229
222
  const pagesOutput = options.includePageNumbers
230
223
  ? pageTexts
231
224
  : pageTexts.map(p => p.text);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@leolionart/n8n-nodes-pdf-extractor",
3
- "version": "1.1.0",
3
+ "version": "1.2.0",
4
4
  "description": "n8n community node to extract text from password-protected PDFs - no external dependencies required",
5
5
  "keywords": [
6
6
  "n8n-community-node-package",
@@ -59,6 +59,6 @@
59
59
  "n8n-workflow": "*"
60
60
  },
61
61
  "dependencies": {
62
- "pdfjs-dist": "^4.9.155"
62
+ "unpdf": "^0.12.1"
63
63
  }
64
64
  }