n8n-nodes-crawl4ai-plus 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74):
  1. package/LICENSE +23 -0
  2. package/LICENSE.md +19 -0
  3. package/README.md +157 -0
  4. package/dist/credentials/Crawl4aiApi.credentials.d.ts +7 -0
  5. package/dist/credentials/Crawl4aiApi.credentials.js +242 -0
  6. package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -0
  7. package/dist/nodes/Crawl4aiBasicCrawler/Crawl4aiBasicCrawler.node.d.ts +5 -0
  8. package/dist/nodes/Crawl4aiBasicCrawler/Crawl4aiBasicCrawler.node.js +37 -0
  9. package/dist/nodes/Crawl4aiBasicCrawler/Crawl4aiBasicCrawler.node.js.map +1 -0
  10. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlMultipleUrls.operation.d.ts +4 -0
  11. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlMultipleUrls.operation.js +299 -0
  12. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -0
  13. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlSingleUrl.operation.d.ts +4 -0
  14. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlSingleUrl.operation.js +324 -0
  15. package/dist/nodes/Crawl4aiBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -0
  16. package/dist/nodes/Crawl4aiBasicCrawler/actions/operations.d.ts +8 -0
  17. package/dist/nodes/Crawl4aiBasicCrawler/actions/operations.js +67 -0
  18. package/dist/nodes/Crawl4aiBasicCrawler/actions/operations.js.map +1 -0
  19. package/dist/nodes/Crawl4aiBasicCrawler/actions/processRawHtml.operation.d.ts +4 -0
  20. package/dist/nodes/Crawl4aiBasicCrawler/actions/processRawHtml.operation.js +148 -0
  21. package/dist/nodes/Crawl4aiBasicCrawler/actions/processRawHtml.operation.js.map +1 -0
  22. package/dist/nodes/Crawl4aiBasicCrawler/actions/router.d.ts +2 -0
  23. package/dist/nodes/Crawl4aiBasicCrawler/actions/router.js +37 -0
  24. package/dist/nodes/Crawl4aiBasicCrawler/actions/router.js.map +1 -0
  25. package/dist/nodes/Crawl4aiBasicCrawler/crawl4ai.svg +17 -0
  26. package/dist/nodes/Crawl4aiBasicCrawler/helpers/apiClient.d.ts +15 -0
  27. package/dist/nodes/Crawl4aiBasicCrawler/helpers/apiClient.js +226 -0
  28. package/dist/nodes/Crawl4aiBasicCrawler/helpers/apiClient.js.map +1 -0
  29. package/dist/nodes/Crawl4aiBasicCrawler/helpers/formatters.d.ts +5 -0
  30. package/dist/nodes/Crawl4aiBasicCrawler/helpers/formatters.js +81 -0
  31. package/dist/nodes/Crawl4aiBasicCrawler/helpers/formatters.js.map +1 -0
  32. package/dist/nodes/Crawl4aiBasicCrawler/helpers/interfaces.d.ts +189 -0
  33. package/dist/nodes/Crawl4aiBasicCrawler/helpers/interfaces.js +3 -0
  34. package/dist/nodes/Crawl4aiBasicCrawler/helpers/interfaces.js.map +1 -0
  35. package/dist/nodes/Crawl4aiBasicCrawler/helpers/utils.d.ts +8 -0
  36. package/dist/nodes/Crawl4aiBasicCrawler/helpers/utils.js +97 -0
  37. package/dist/nodes/Crawl4aiBasicCrawler/helpers/utils.js.map +1 -0
  38. package/dist/nodes/Crawl4aiContentExtractor/Crawl4aiContentExtractor.node.d.ts +5 -0
  39. package/dist/nodes/Crawl4aiContentExtractor/Crawl4aiContentExtractor.node.js +38 -0
  40. package/dist/nodes/Crawl4aiContentExtractor/Crawl4aiContentExtractor.node.js.map +1 -0
  41. package/dist/nodes/Crawl4aiContentExtractor/actions/cssExtractor.operation.d.ts +4 -0
  42. package/dist/nodes/Crawl4aiContentExtractor/actions/cssExtractor.operation.js +336 -0
  43. package/dist/nodes/Crawl4aiContentExtractor/actions/cssExtractor.operation.js.map +1 -0
  44. package/dist/nodes/Crawl4aiContentExtractor/actions/jsonExtractor.operation.d.ts +4 -0
  45. package/dist/nodes/Crawl4aiContentExtractor/actions/jsonExtractor.operation.js +369 -0
  46. package/dist/nodes/Crawl4aiContentExtractor/actions/jsonExtractor.operation.js.map +1 -0
  47. package/dist/nodes/Crawl4aiContentExtractor/actions/llmExtractor.operation.d.ts +4 -0
  48. package/dist/nodes/Crawl4aiContentExtractor/actions/llmExtractor.operation.js +786 -0
  49. package/dist/nodes/Crawl4aiContentExtractor/actions/llmExtractor.operation.js.map +1 -0
  50. package/dist/nodes/Crawl4aiContentExtractor/actions/operations.d.ts +8 -0
  51. package/dist/nodes/Crawl4aiContentExtractor/actions/operations.js +76 -0
  52. package/dist/nodes/Crawl4aiContentExtractor/actions/operations.js.map +1 -0
  53. package/dist/nodes/Crawl4aiContentExtractor/actions/regexExtractor.operation.d.ts +4 -0
  54. package/dist/nodes/Crawl4aiContentExtractor/actions/regexExtractor.operation.js +437 -0
  55. package/dist/nodes/Crawl4aiContentExtractor/actions/regexExtractor.operation.js.map +1 -0
  56. package/dist/nodes/Crawl4aiContentExtractor/actions/router.d.ts +2 -0
  57. package/dist/nodes/Crawl4aiContentExtractor/actions/router.js +37 -0
  58. package/dist/nodes/Crawl4aiContentExtractor/actions/router.js.map +1 -0
  59. package/dist/nodes/Crawl4aiContentExtractor/crawl4ai.svg +17 -0
  60. package/dist/nodes/Crawl4aiContentExtractor/helpers/apiClient.d.ts +1 -0
  61. package/dist/nodes/Crawl4aiContentExtractor/helpers/apiClient.js +7 -0
  62. package/dist/nodes/Crawl4aiContentExtractor/helpers/apiClient.js.map +1 -0
  63. package/dist/nodes/Crawl4aiContentExtractor/helpers/formatters.d.ts +1 -0
  64. package/dist/nodes/Crawl4aiContentExtractor/helpers/formatters.js +8 -0
  65. package/dist/nodes/Crawl4aiContentExtractor/helpers/formatters.js.map +1 -0
  66. package/dist/nodes/Crawl4aiContentExtractor/helpers/interfaces.d.ts +1 -0
  67. package/dist/nodes/Crawl4aiContentExtractor/helpers/interfaces.js +3 -0
  68. package/dist/nodes/Crawl4aiContentExtractor/helpers/interfaces.js.map +1 -0
  69. package/dist/nodes/Crawl4aiContentExtractor/helpers/utils.d.ts +6 -0
  70. package/dist/nodes/Crawl4aiContentExtractor/helpers/utils.js +89 -0
  71. package/dist/nodes/Crawl4aiContentExtractor/helpers/utils.js.map +1 -0
  72. package/dist/tsconfig.tsbuildinfo +1 -0
  73. package/index.js +14 -0
  74. package/package.json +70 -0
@@ -0,0 +1,786 @@
1
+ "use strict";
2
+ Object.defineProperty(exports, "__esModule", { value: true });
3
+ exports.description = void 0;
4
+ exports.execute = execute;
5
+ const n8n_workflow_1 = require("n8n-workflow");
6
+ const utils_1 = require("../helpers/utils");
7
+ const formatters_1 = require("../../Crawl4aiBasicCrawler/helpers/formatters");
8
/**
 * n8n parameter definitions for the "llmExtractor" operation.
 *
 * Every top-level entry is shown only when `operation` is 'llmExtractor'.
 * The two schema inputs ('schemaFields' and 'jsonSchema') are additionally
 * gated on the value of 'schemaMode'. The values collected here are read
 * back by `execute` through `getNodeParameter`.
 */
exports.description = [
    // Page to crawl and extract from.
    {
        displayName: 'URL',
        name: 'url',
        type: 'string',
        required: true,
        default: '',
        placeholder: 'https://example.com',
        description: 'The URL to extract content from',
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
    },
    // Free-text prompt handed to the LLM.
    {
        displayName: 'Extraction Instructions',
        name: 'instruction',
        type: 'string',
        typeOptions: {
            rows: 4,
        },
        required: true,
        default: '',
        placeholder: 'Extract the product name, price, and description from this page.',
        description: 'Instructions for the LLM on what to extract from the page',
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
    },
    // Switch between the field-by-field schema builder and the raw JSON schema input.
    {
        displayName: 'Schema Input Mode',
        name: 'schemaMode',
        type: 'options',
        options: [
            {
                name: 'Simple Fields',
                value: 'simple',
                description: 'Define schema using individual field inputs',
            },
            {
                name: 'Advanced JSON',
                value: 'advanced',
                description: 'Define schema using JSON editor',
            },
        ],
        default: 'simple',
        description: 'Choose how to define the extraction schema',
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
    },
    // Schema builder, visible only when schemaMode is 'simple'.
    {
        displayName: 'Schema Fields',
        name: 'schemaFields',
        placeholder: 'Add Schema Field',
        type: 'fixedCollection',
        typeOptions: {
            multipleValues: true,
        },
        default: {},
        required: true,
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
                schemaMode: ['simple'],
            },
        },
        options: [
            {
                // Read by execute() as 'schemaFields.fieldsValues'.
                name: 'fieldsValues',
                displayName: 'Fields',
                values: [
                    {
                        displayName: 'Field Name',
                        name: 'name',
                        type: 'string',
                        required: true,
                        default: '',
                        placeholder: 'title',
                        description: 'Name of the field to extract',
                    },
                    {
                        displayName: 'Field Type',
                        name: 'fieldType',
                        type: 'options',
                        options: [
                            {
                                name: 'String',
                                value: 'string',
                                description: 'Plain text string',
                            },
                            {
                                name: 'Number',
                                value: 'number',
                                description: 'Numeric value',
                            },
                            {
                                name: 'Boolean',
                                value: 'boolean',
                                description: 'True/false value',
                            },
                            {
                                name: 'Array',
                                value: 'array',
                                description: 'Array of values',
                            },
                        ],
                        default: 'string',
                        description: 'Type of the field',
                    },
                    {
                        displayName: 'Description',
                        name: 'description',
                        type: 'string',
                        default: '',
                        placeholder: 'The main title of the product',
                        description: 'Description of the field to help the LLM understand what to extract',
                    },
                    {
                        displayName: 'Required',
                        name: 'required',
                        type: 'boolean',
                        default: true,
                        description: 'Whether this field is required',
                    },
                ],
            },
        ],
    },
    // Raw JSON schema input, visible only when schemaMode is 'advanced'.
    // NOTE(review): whitespace inside the two template literals below is part of
    // the runtime default/placeholder text, so it is kept exactly as it appears here.
    {
        displayName: 'JSON Schema',
        name: 'jsonSchema',
        type: 'string',
        default: `{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Main page title"
},
"description": {
"type": "string",
"description": "Page description or summary"
}
},
"required": ["title"]
}`,
        placeholder: `{
"type": "object",
"properties": {
"title": {
"type": "string",
"description": "Main page title"
},
"price": {
"type": "number",
"description": "Product price"
},
"features": {
"type": "array",
"items": {"type": "string"},
"description": "List of product features"
}
},
"required": ["title", "price"]
}`,
        description: 'JSON schema defining the structure of data to extract. Must be valid JSON format.',
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
                schemaMode: ['advanced'],
            },
        },
        typeOptions: {
            rows: 12,
            alwaysOpenEditWindow: false,
        },
    },
    // Optional browser settings; execute() forwards the whole collection to
    // createBrowserConfig and additionally passes 'jsCode' to the crawl call.
    {
        displayName: 'Browser Options',
        name: 'browserOptions',
        type: 'collection',
        placeholder: 'Add Option',
        default: {},
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
        options: [
            {
                displayName: 'Browser Type',
                name: 'browserType',
                type: 'options',
                options: [
                    {
                        name: 'Chromium',
                        value: 'chromium',
                        description: 'Use Chromium browser (default, most compatible)',
                    },
                    {
                        name: 'Firefox',
                        value: 'firefox',
                        description: 'Use Firefox browser',
                    },
                    {
                        name: 'Webkit',
                        value: 'webkit',
                        description: 'Use Webkit browser (Safari engine)',
                    },
                ],
                default: 'chromium',
                description: 'Which browser engine to use for crawling',
            },
            {
                displayName: 'Enable JavaScript',
                name: 'javaScriptEnabled',
                type: 'boolean',
                default: true,
                description: 'Whether to enable JavaScript execution',
            },
            {
                displayName: 'Enable Stealth Mode',
                name: 'enableStealth',
                type: 'boolean',
                default: false,
                description: 'Whether to enable stealth mode to bypass basic bot detection (hides webdriver properties and modifies browser fingerprints)',
            },
            {
                displayName: 'Headless Mode',
                name: 'headless',
                type: 'boolean',
                default: true,
                description: 'Whether to run browser in headless mode',
            },
            {
                displayName: 'JavaScript Code',
                name: 'jsCode',
                type: 'string',
                typeOptions: {
                    rows: 4,
                },
                default: '',
                placeholder: 'document.querySelector("button.load-more").click();',
                description: 'JavaScript code to execute before extraction (e.g., to click buttons, scroll)',
            },
            {
                displayName: 'Timeout (MS)',
                name: 'timeout',
                type: 'number',
                default: 60000,
                description: 'Maximum time to wait for the browser to load the page',
            },
            {
                displayName: 'Viewport Height',
                name: 'viewportHeight',
                type: 'number',
                default: 800,
                description: 'The height of the browser viewport',
            },
            {
                displayName: 'Viewport Width',
                name: 'viewportWidth',
                type: 'number',
                default: 1280,
                description: 'The width of the browser viewport',
            },
        ],
    },
    // Optional LLM settings. 'llmProvider' and 'apiKey' are shown only when
    // 'overrideProvider' is true; execute() ignores them otherwise.
    {
        displayName: 'LLM Options',
        name: 'llmOptions',
        type: 'collection',
        placeholder: 'Add Option',
        default: {},
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
        options: [
            {
                displayName: 'LLM Provider',
                name: 'llmProvider',
                type: 'options',
                options: [
                    {
                        name: 'Anthropic Claude 3 Haiku',
                        value: 'anthropic/claude-3-haiku-20240307',
                        description: 'Claude 3 Haiku (Fast)',
                    },
                    {
                        name: 'Anthropic Claude 3 Opus',
                        value: 'anthropic/claude-3-opus-20240229',
                        description: 'Claude 3 Opus (Most Capable)',
                    },
                    {
                        name: 'Anthropic Claude 3 Sonnet',
                        value: 'anthropic/claude-3-sonnet-20240229',
                    },
                    {
                        name: 'Anthropic Claude 3.5 Sonnet',
                        value: 'anthropic/claude-3-5-sonnet-20241022',
                    },
                    {
                        name: 'Anthropic Claude 3.7 Sonnet',
                        value: 'anthropic/claude-3-7-sonnet-20250219',
                        description: 'Claude 3.7 Sonnet (Latest, Best)',
                    },
                    {
                        name: 'DeepSeek Chat',
                        value: 'deepseek/deepseek-chat',
                        description: 'DeepSeek Chat (Affordable)',
                    },
                    {
                        name: 'DeepSeek Coder',
                        value: 'deepseek/deepseek-coder',
                        description: 'DeepSeek Coder (Code-Focused)',
                    },
                    {
                        name: 'Google Gemini 1.5 Flash',
                        value: 'gemini/gemini-1.5-flash',
                        description: 'Gemini 1.5 Flash (Fast)',
                    },
                    {
                        name: 'Google Gemini 1.5 Pro',
                        value: 'gemini/gemini-1.5-pro',
                        description: 'Gemini 1.5 Pro (Large Context)',
                    },
                    {
                        name: 'Google Gemini Pro',
                        value: 'gemini/gemini-pro',
                    },
                    {
                        name: 'Groq Llama 3 70B',
                        value: 'groq/llama3-70b-8192',
                        description: 'Groq Llama 3 70B (Fast)',
                    },
                    {
                        name: 'Groq Llama 3.1 70B',
                        value: 'groq/llama-3.1-70b-versatile',
                        description: 'Groq Llama 3.1 70B (Fast)',
                    },
                    {
                        name: 'Groq Llama 3.3 70B',
                        value: 'groq/llama-3.3-70b-versatile',
                        description: 'Groq Llama 3.3 70B (Fast)',
                    },
                    {
                        name: 'Groq Mixtral 8x7B',
                        value: 'groq/mixtral-8x7b-32768',
                        description: 'Groq Mixtral 8x7B (Fast)',
                    },
                    {
                        name: 'Ollama Llama 3',
                        value: 'ollama/llama3',
                        description: 'Ollama Llama 3 (Local)',
                    },
                    {
                        name: 'Ollama Llama 3.3',
                        value: 'ollama/llama3.3',
                        description: 'Ollama Llama 3.3 (Local)',
                    },
                    {
                        name: 'Ollama Mistral',
                        value: 'ollama/mistral',
                        description: 'Ollama Mistral (Local)',
                    },
                    {
                        name: 'Ollama Qwen 2.5',
                        value: 'ollama/qwen2.5',
                        description: 'Ollama Qwen 2.5 (Local)',
                    },
                    {
                        name: 'OpenAI GPT-3.5 Turbo',
                        value: 'openai/gpt-3.5-turbo',
                        description: 'OpenAI GPT-3.5 Turbo (Fast)',
                    },
                    {
                        name: 'OpenAI GPT-4 Turbo',
                        value: 'openai/gpt-4-turbo',
                    },
                    {
                        name: 'OpenAI GPT-4o',
                        value: 'openai/gpt-4o',
                        description: 'OpenAI GPT-4o (Recommended)',
                    },
                    {
                        name: 'OpenAI GPT-4o Mini',
                        value: 'openai/gpt-4o-mini',
                        description: 'OpenAI GPT-4o Mini (Fast & Affordable)',
                    },
                ],
                default: 'openai/gpt-4o-mini',
                description: 'LLM provider to use for extraction. Supports 100+ models via LiteLLM.',
                displayOptions: {
                    show: {
                        overrideProvider: [true],
                    },
                },
            },
            {
                displayName: 'Max Tokens',
                name: 'maxTokens',
                type: 'number',
                default: 2000,
                description: 'Maximum number of tokens for the LLM response',
            },
            {
                displayName: 'Override LLM Provider',
                name: 'overrideProvider',
                type: 'boolean',
                default: false,
                description: 'Whether to override the LLM provider from credentials',
            },
            {
                displayName: 'Provider API Key',
                name: 'apiKey',
                type: 'string',
                typeOptions: {
                    password: true,
                },
                default: '',
                description: 'API key for the LLM provider (leave empty to use API key from credentials)',
                displayOptions: {
                    show: {
                        overrideProvider: [true],
                    },
                },
            },
            {
                displayName: 'Temperature',
                name: 'temperature',
                type: 'number',
                typeOptions: {
                    minValue: 0,
                    maxValue: 1,
                    numberPrecision: 1,
                },
                default: 0,
                description: 'Controls randomness: 0 for deterministic results, higher for more creativity',
            },
        ],
    },
    // Output shaping and crawl options.
    {
        displayName: 'Options',
        name: 'options',
        type: 'collection',
        placeholder: 'Add Option',
        default: {},
        displayOptions: {
            show: {
                operation: ['llmExtractor'],
            },
        },
        options: [
            {
                // These values are the `strategy` argument of processArrayHandling.
                displayName: 'Array Handling',
                name: 'arrayHandling',
                type: 'options',
                options: [
                    {
                        name: 'Keep As Object (Default)',
                        value: 'none',
                        description: 'Maintain current behavior - arrays become indexed properties',
                    },
                    {
                        name: 'Split Top-Level Arrays',
                        value: 'topLevel',
                        description: 'Create separate items only for arrays at root level',
                    },
                    {
                        name: 'Split All Object Arrays',
                        value: 'allObjects',
                        description: 'Split any array containing objects, preserve primitive arrays',
                    },
                    {
                        name: 'Smart Split',
                        value: 'smart',
                        description: 'Automatically detect main content arrays and split intelligently',
                    },
                ],
                default: 'none',
                description: 'How to handle arrays in the extracted data',
            },
            {
                // All cache mode values are upper-case identifiers.
                displayName: 'Cache Mode',
                name: 'cacheMode',
                type: 'options',
                options: [
                    {
                        name: 'Bypass (Skip Cache)',
                        value: 'BYPASS',
                        description: 'Skip cache for this operation, fetch fresh content',
                    },
                    {
                        name: 'Disabled (No Cache)',
                        value: 'DISABLED',
                        description: 'No caching at all',
                    },
                    {
                        name: 'Enabled (Read/Write)',
                        value: 'ENABLED',
                        description: 'Use cache if available, save new results to cache',
                    },
                    {
                        name: 'Read Only',
                        value: 'READ_ONLY',
                        description: 'Only read from cache, do not write new results',
                    },
                    {
                        name: 'Write Only',
                        value: 'WRITE_ONLY',
                        description: 'Only write to cache, do not read existing cache',
                    },
                ],
                default: 'ENABLED',
                description: 'How to use the cache when crawling',
            },
            {
                displayName: 'CSS Selector',
                name: 'cssSelector',
                type: 'string',
                default: '',
                placeholder: 'article.content',
                description: 'CSS selector to focus extraction on a specific part of the page (leave empty for full page)',
            },
            {
                // Shown only when one of the splitting strategies is selected.
                displayName: 'Include Metadata in Split Items',
                name: 'includeMetadataInSplitItems',
                type: 'boolean',
                default: false,
                description: 'Whether to include URL, success, and other metadata in each split item (reduces redundancy when disabled)',
                displayOptions: {
                    show: {
                        arrayHandling: ['topLevel', 'allObjects', 'smart'],
                    },
                },
            },
            {
                displayName: 'Include Original Text',
                name: 'includeFullText',
                type: 'boolean',
                default: false,
                description: 'Whether to include the original webpage text in output',
            },
        ],
    },
];
562
/**
 * Lists the own enumerable keys of `obj` that consist solely of decimal
 * digits, ordered by ascending numeric value.
 *
 * @param {object} obj - Object (or array) whose keys are inspected.
 * @returns {string[]} Digit-only keys, sorted numerically.
 */
function getNumericKeys(obj) {
    const digitsOnly = /^\d+$/;
    const byNumericValue = (left, right) => parseInt(left, 10) - parseInt(right, 10);
    const indexLikeKeys = Object.keys(obj).filter((key) => digitsOnly.test(key));
    return indexLikeKeys.sort(byNumericValue);
}
565
/**
 * Lists the own enumerable keys of `obj` that are NOT made up solely of
 * decimal digits, in the order `Object.keys` reports them.
 *
 * @param {object} obj - Object whose keys are inspected.
 * @returns {string[]} Keys that are not digit-only.
 */
function getMetadataKeys(obj) {
    const namedKeys = [];
    for (const key of Object.keys(obj)) {
        if (!/^\d+$/.test(key)) {
            namedKeys.push(key);
        }
    }
    return namedKeys;
}
568
/**
 * Decides whether the digit-keyed entries of `obj` look like a list of
 * records. Only the entry under the numerically smallest digit-only key is
 * inspected: it must be a non-null object with at least two own keys.
 *
 * @param {object} obj - Object whose digit-keyed entries are examined.
 * @returns {'numeric'|null} 'numeric' when the first entry qualifies, otherwise null.
 */
function detectMainArray(obj) {
    const [firstKey] = getNumericKeys(obj);
    if (firstKey === undefined) {
        return null;
    }
    const candidate = obj[firstKey];
    const looksLikeRecord = candidate !== null
        && typeof candidate === 'object'
        && Object.keys(candidate).length >= 2;
    return looksLikeRecord ? 'numeric' : null;
}
581
/**
 * Splits an extraction result whose list entries live under digit-only keys
 * into separate output objects, according to `strategy`.
 *
 * - 'none': returns `[data]` untouched.
 * - 'topLevel': one output per digit-only key.
 * - 'allObjects': one output per digit-only key, but only when the first
 *   entry is a non-null object; otherwise `[data]`.
 * - 'smart': one output per digit-only key, but only when detectMainArray
 *   reports 'numeric'; otherwise `[data]`.
 * - anything else, or no digit-only keys at all: `[data]`.
 *
 * Object entries are spread into the output; any other entry is stored under
 * a `value` property.
 *
 * @param {object} data - Result object to split.
 * @param {string} strategy - One of 'none', 'topLevel', 'allObjects', 'smart'.
 * @param {object} baseMetadata - Extra properties merged into every split item
 *   (applied after, and therefore overriding, the non-numeric keys of `data`).
 * @param {boolean} [includeMetadata=true] - When false, split items carry only
 *   the entry's own data.
 * @returns {object[]} The split items, or `[data]` when nothing is split.
 */
function processArrayHandling(data, strategy, baseMetadata, includeMetadata = true) {
    if (strategy === 'none') {
        return [data];
    }
    const indexKeys = getNumericKeys(data);
    // Properties copied into every split item; stays empty when metadata is excluded.
    const shared = {};
    if (includeMetadata) {
        for (const key of getMetadataKeys(data)) {
            shared[key] = data[key];
        }
        for (const [key, value] of Object.entries(baseMetadata)) {
            shared[key] = value;
        }
    }
    if (indexKeys.length === 0) {
        return [data];
    }
    const isRecord = (candidate) => typeof candidate === 'object' && candidate !== null;
    const splitEveryEntry = () => indexKeys.map((key) => {
        const entry = data[key];
        return isRecord(entry)
            ? { ...shared, ...entry }
            : { ...shared, value: entry };
    });
    if (strategy === 'topLevel') {
        return splitEveryEntry();
    }
    if (strategy === 'allObjects') {
        return isRecord(data[indexKeys[0]]) ? splitEveryEntry() : [data];
    }
    if (strategy === 'smart') {
        return detectMainArray(data) === 'numeric' ? splitEveryEntry() : [data];
    }
    return [data];
}
642
/**
 * Builds the extraction schema from the entries of the "Simple Fields" collection.
 *
 * @param {Array<{name: string, fieldType: string, description?: string, required?: boolean}>} fields
 * @returns {object} Schema object with `title`, `type`, `properties` and (when any
 *   field is marked required) `required`.
 */
function buildSimpleSchema(fields) {
    const properties = {};
    const requiredFields = [];
    for (const field of fields) {
        properties[field.name] = {
            name: field.name,
            type: field.fieldType,
            description: field.description || undefined,
        };
        if (field.required === true) {
            requiredFields.push(field.name);
        }
    }
    return {
        title: 'ExtractedData',
        type: 'object',
        properties,
        required: requiredFields.length > 0 ? requiredFields : undefined,
    };
}

/**
 * Turns the "Advanced JSON" schema input into a schema object.
 *
 * Accepts either JSON text or an already-parsed plain object (a parameter value
 * is not guaranteed to be a string). Missing `type` / `title` are filled in with
 * 'object' / 'ExtractedData'.
 *
 * @param {string|object} rawSchema - Value of the 'jsonSchema' parameter.
 * @param {object} node - Node reference used when constructing errors.
 * @param {number} itemIndex - Index of the input item being processed.
 * @returns {object} A new schema object (the input is never mutated).
 * @throws {NodeOperationError} When the input is empty, is not valid JSON, or
 *   does not describe a plain object.
 */
function parseAdvancedSchema(rawSchema, node, itemIndex) {
    const isBlank = rawSchema === undefined
        || rawSchema === null
        || (typeof rawSchema === 'string' && rawSchema.trim() === '');
    if (isBlank) {
        throw new n8n_workflow_1.NodeOperationError(node, 'JSON schema cannot be empty.', { itemIndex });
    }
    let parsedSchema = rawSchema;
    if (typeof rawSchema === 'string') {
        try {
            parsedSchema = JSON.parse(rawSchema.trim());
        }
        catch (error) {
            throw new n8n_workflow_1.NodeOperationError(node, `Invalid JSON schema: ${error.message}`, { itemIndex });
        }
    }
    // Arrays are typeof 'object' too, but cannot serve as an object schema.
    if (!parsedSchema || typeof parsedSchema !== 'object' || Array.isArray(parsedSchema)) {
        throw new n8n_workflow_1.NodeOperationError(node, 'JSON schema must be a valid object', { itemIndex });
    }
    const schema = { ...parsedSchema };
    if (!schema.type) {
        schema.type = 'object';
    }
    if (!schema.title) {
        schema.title = 'ExtractedData';
    }
    return schema;
}

/**
 * Picks the LLM provider, API key and base URL to use.
 *
 * Credentials supply the defaults (the custom* fields when the credential
 * provider is 'other'). When `llmOptions.overrideProvider` is true, non-empty
 * `llmOptions.llmProvider` / `llmOptions.apiKey` replace provider and key; the
 * base URL is left as resolved from the credentials.
 *
 * @param {object} credentials - Resolved 'crawl4aiApi' credentials.
 * @param {object} llmOptions - Value of the 'llmOptions' collection.
 * @returns {{provider: string, apiKey: string, baseUrl: (string|undefined)}}
 */
function resolveLlmSettings(credentials, llmOptions) {
    const usesCustomProvider = credentials.llmProvider === 'other';
    let provider = (usesCustomProvider ? credentials.customProvider : credentials.llmProvider) || 'openai/gpt-4o';
    let apiKey = (usesCustomProvider ? credentials.customApiKey : credentials.apiKey) || '';
    const baseUrl = usesCustomProvider ? (credentials.customBaseUrl || undefined) : undefined;
    if (llmOptions.overrideProvider === true) {
        provider = llmOptions.llmProvider || provider;
        apiKey = llmOptions.apiKey || apiKey;
    }
    return { provider, apiKey, baseUrl };
}

/**
 * Runs the "llmExtractor" operation for every input item.
 *
 * Must be called with the n8n execution context as `this`; the function uses
 * `getCredentials`, `getNodeParameter`, `getNode` and `continueOnFail` on it.
 * For each item it validates the parameters, builds the extraction schema,
 * crawls the URL with an LLM extraction strategy, and converts the result into
 * one or more output items (see processArrayHandling).
 *
 * @param {Array<{json: object}>} items - Input items.
 * @param {*} nodeOptions - Not used by this operation.
 * @returns {Promise<Array<object>>} Output items, each paired with the index of
 *   the input item that produced it. With continue-on-fail enabled, a failed
 *   item yields its original `json` plus an `error` entry.
 * @throws {NodeOperationError} When LLM features are disabled in the
 *   credentials, or when an item fails and continue-on-fail is off.
 */
async function execute(items, nodeOptions) {
    const allResults = [];
    const credentials = (await this.getCredentials('crawl4aiApi'));
    if (!credentials.enableLlm) {
        throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features are not enabled in Crawl4AI credentials. Please enable them and configure an LLM provider.', { itemIndex: 0 });
    }
    for (let i = 0; i < items.length; i++) {
        try {
            const url = this.getNodeParameter('url', i, '');
            const instruction = this.getNodeParameter('instruction', i, '');
            const schemaMode = this.getNodeParameter('schemaMode', i, 'simple');
            const schemaFieldsValues = this.getNodeParameter('schemaFields.fieldsValues', i, []);
            // Fall back to an empty string (not an object) so a missing value is
            // reported as "cannot be empty" instead of crashing on `.trim()`.
            const jsonSchema = this.getNodeParameter('jsonSchema', i, '');
            const browserOptions = this.getNodeParameter('browserOptions', i, {});
            const llmOptions = this.getNodeParameter('llmOptions', i, {});
            const options = this.getNodeParameter('options', i, {});
            const arrayHandling = options.arrayHandling || 'none';
            const includeMetadataInSplitItems = options.includeMetadataInSplitItems || false;
            if (!url) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i });
            }
            if (!(0, utils_1.isValidUrl)(url)) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i });
            }
            if (!instruction) {
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Extraction instructions cannot be empty.', { itemIndex: i });
            }
            let schema;
            if (schemaMode === 'simple') {
                if (!schemaFieldsValues || schemaFieldsValues.length === 0) {
                    throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'At least one schema field must be defined.', { itemIndex: i });
                }
                schema = buildSimpleSchema(schemaFieldsValues);
            }
            else {
                schema = parseAdvancedSchema(jsonSchema, this.getNode(), i);
            }
            const { provider, apiKey, baseUrl } = resolveLlmSettings(credentials, llmOptions);
            const browserConfig = (0, utils_1.createBrowserConfig)(browserOptions);
            const extractionStrategy = (0, utils_1.createLlmExtractionStrategy)(schema, instruction, provider, apiKey, baseUrl);
            const crawler = await (0, utils_1.getCrawl4aiClient)(this);
            const extraArgs = {};
            if (llmOptions.temperature !== undefined) {
                extraArgs.temperature = llmOptions.temperature;
            }
            if (llmOptions.maxTokens !== undefined) {
                extraArgs.max_tokens = llmOptions.maxTokens;
            }
            const result = await crawler.arun(url, {
                browserConfig,
                extractionStrategy,
                // Same spelling as the 'Cache Mode' option values and its UI default.
                cacheMode: options.cacheMode || 'ENABLED',
                jsCode: browserOptions.jsCode,
                cssSelector: options.cssSelector,
                extraArgs,
            });
            const extractedData = (0, formatters_1.parseExtractedJson)(result);
            const formattedResult = (0, formatters_1.formatExtractionResult)(result, extractedData, options.includeFullText);
            const processedResults = processArrayHandling(formattedResult, arrayHandling, {}, includeMetadataInSplitItems);
            for (const processedResult of processedResults) {
                allResults.push({
                    json: processedResult,
                    pairedItem: { item: i },
                });
            }
        }
        catch (error) {
            if (this.continueOnFail()) {
                // Prefer the index recorded on the error, if it carries one.
                const hasOwnIndex = error.itemIndex !== null && error.itemIndex !== undefined;
                const errorItemIndex = hasOwnIndex ? error.itemIndex : i;
                allResults.push({
                    json: items[i].json,
                    error: new n8n_workflow_1.NodeOperationError(this.getNode(), error.message, { itemIndex: errorItemIndex }),
                    pairedItem: { item: i },
                });
                continue;
            }
            throw error;
        }
    }
    return allResults;
}
786
+ //# sourceMappingURL=llmExtractor.operation.js.map