n8n-nodes-crawl4ai-plus 2.0.9 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -23
- package/README.md +129 -41
- package/dist/credentials/Crawl4aiApi.credentials.js +2 -34
- package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js +1230 -30
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js +715 -9
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js +495 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js +9 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.d.ts +4 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js +94 -60
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.d.ts +8 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js +49 -12
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/interfaces.d.ts +38 -5
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.d.ts +13 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js +270 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js +445 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js +108 -8
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js +49 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js +134 -17
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js +27 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js +206 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js +376 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.d.ts +4 -2
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js +53 -16
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/index.js +11 -11
- package/package.json +1 -1
|
@@ -54,7 +54,7 @@ exports.description = [
|
|
|
54
54
|
},
|
|
55
55
|
],
|
|
56
56
|
default: 'chromium',
|
|
57
|
-
description: 'Which browser engine to use for crawling',
|
|
57
|
+
description: 'Which browser engine to use for crawling. Default: Chromium (if not specified).',
|
|
58
58
|
},
|
|
59
59
|
{
|
|
60
60
|
displayName: 'Enable JavaScript',
|
|
@@ -70,6 +70,32 @@ exports.description = [
|
|
|
70
70
|
default: false,
|
|
71
71
|
description: 'Whether to enable stealth mode to bypass basic bot detection (hides webdriver properties and modifies browser fingerprints)',
|
|
72
72
|
},
|
|
73
|
+
{
|
|
74
|
+
displayName: 'Extra Browser Arguments',
|
|
75
|
+
name: 'extraArgs',
|
|
76
|
+
type: 'fixedCollection',
|
|
77
|
+
typeOptions: {
|
|
78
|
+
multipleValues: true,
|
|
79
|
+
},
|
|
80
|
+
default: {},
|
|
81
|
+
description: 'Additional command-line arguments to pass to the browser (advanced users only)',
|
|
82
|
+
options: [
|
|
83
|
+
{
|
|
84
|
+
name: 'args',
|
|
85
|
+
displayName: 'Arguments',
|
|
86
|
+
values: [
|
|
87
|
+
{
|
|
88
|
+
displayName: 'Argument',
|
|
89
|
+
name: 'value',
|
|
90
|
+
type: 'string',
|
|
91
|
+
default: '',
|
|
92
|
+
placeholder: '--disable-blink-features=AutomationControlled',
|
|
93
|
+
description: 'Browser command-line argument (e.g., --disable-blink-features=AutomationControlled)',
|
|
94
|
+
},
|
|
95
|
+
],
|
|
96
|
+
},
|
|
97
|
+
],
|
|
98
|
+
},
|
|
73
99
|
{
|
|
74
100
|
displayName: 'Headless Mode',
|
|
75
101
|
name: 'headless',
|
|
@@ -108,6 +134,71 @@ exports.description = [
|
|
|
108
134
|
},
|
|
109
135
|
],
|
|
110
136
|
},
|
|
137
|
+
{
|
|
138
|
+
displayName: 'Session & Authentication',
|
|
139
|
+
name: 'sessionOptions',
|
|
140
|
+
type: 'collection',
|
|
141
|
+
placeholder: 'Add Option',
|
|
142
|
+
default: {},
|
|
143
|
+
displayOptions: {
|
|
144
|
+
show: {
|
|
145
|
+
operation: ['crawlSingleUrl'],
|
|
146
|
+
},
|
|
147
|
+
},
|
|
148
|
+
options: [
|
|
149
|
+
{
|
|
150
|
+
displayName: 'Cookies',
|
|
151
|
+
name: 'cookies',
|
|
152
|
+
type: 'json',
|
|
153
|
+
default: '',
|
|
154
|
+
placeholder: '[{"name": "session_id", "value": "abc123", "domain": ".example.com", "path": "/"}]',
|
|
155
|
+
description: 'Array of cookie objects to inject. Alternative to storage state for simple cookie-based auth.',
|
|
156
|
+
},
|
|
157
|
+
{
|
|
158
|
+
displayName: 'Storage State (JSON)',
|
|
159
|
+
name: 'storageState',
|
|
160
|
+
type: 'string',
|
|
161
|
+
typeOptions: {
|
|
162
|
+
rows: 6,
|
|
163
|
+
},
|
|
164
|
+
default: '',
|
|
165
|
+
placeholder: '{"cookies": [...], "origins": [...]}',
|
|
166
|
+
description: 'Browser storage state as JSON (cookies, localStorage, sessionStorage). Captures authenticated session state. Works in all n8n environments.',
|
|
167
|
+
},
|
|
168
|
+
{
|
|
169
|
+
displayName: 'Use Managed Browser',
|
|
170
|
+
name: 'useManagedBrowser',
|
|
171
|
+
type: 'boolean',
|
|
172
|
+
default: false,
|
|
173
|
+
description: 'Whether to use managed browser mode (required for persistent contexts). Advanced option.',
|
|
174
|
+
displayOptions: {
|
|
175
|
+
show: {
|
|
176
|
+
usePersistentContext: [true],
|
|
177
|
+
},
|
|
178
|
+
},
|
|
179
|
+
},
|
|
180
|
+
{
|
|
181
|
+
displayName: 'Use Persistent Browser Context',
|
|
182
|
+
name: 'usePersistentContext',
|
|
183
|
+
type: 'boolean',
|
|
184
|
+
default: false,
|
|
185
|
+
description: 'Whether to use a persistent browser context (requires user data directory). Only use in self-hosted environments with persistent storage.',
|
|
186
|
+
},
|
|
187
|
+
{
|
|
188
|
+
displayName: 'User Data Directory',
|
|
189
|
+
name: 'userDataDir',
|
|
190
|
+
type: 'string',
|
|
191
|
+
default: '',
|
|
192
|
+
placeholder: '/data/browser-profiles/profile1',
|
|
193
|
+
description: 'Path to browser profile directory for persistent sessions. Advanced: Only works in self-hosted n8n with persistent volumes. Use Storage State for cloud deployments.',
|
|
194
|
+
displayOptions: {
|
|
195
|
+
show: {
|
|
196
|
+
usePersistentContext: [true],
|
|
197
|
+
},
|
|
198
|
+
},
|
|
199
|
+
},
|
|
200
|
+
],
|
|
201
|
+
},
|
|
111
202
|
{
|
|
112
203
|
displayName: 'Crawler Options',
|
|
113
204
|
name: 'crawlerOptions',
|
|
@@ -249,8 +340,8 @@ exports.description = [
|
|
|
249
340
|
],
|
|
250
341
|
},
|
|
251
342
|
{
|
|
252
|
-
displayName: 'Options',
|
|
253
|
-
name: '
|
|
343
|
+
displayName: 'Output Options',
|
|
344
|
+
name: 'outputOptions',
|
|
254
345
|
type: 'collection',
|
|
255
346
|
placeholder: 'Add Option',
|
|
256
347
|
default: {},
|
|
@@ -260,12 +351,71 @@ exports.description = [
|
|
|
260
351
|
},
|
|
261
352
|
},
|
|
262
353
|
options: [
|
|
354
|
+
{
|
|
355
|
+
displayName: 'Capture Screenshot',
|
|
356
|
+
name: 'screenshot',
|
|
357
|
+
type: 'boolean',
|
|
358
|
+
default: false,
|
|
359
|
+
description: 'Whether to capture a screenshot of the page (returned as base64)',
|
|
360
|
+
},
|
|
361
|
+
{
|
|
362
|
+
displayName: 'Fetch SSL Certificate',
|
|
363
|
+
name: 'fetchSslCertificate',
|
|
364
|
+
type: 'boolean',
|
|
365
|
+
default: false,
|
|
366
|
+
description: 'Whether to retrieve SSL certificate information from the server',
|
|
367
|
+
},
|
|
368
|
+
{
|
|
369
|
+
displayName: 'Generate PDF',
|
|
370
|
+
name: 'pdf',
|
|
371
|
+
type: 'boolean',
|
|
372
|
+
default: false,
|
|
373
|
+
description: 'Whether to generate a PDF of the page (returned as base64 or binary)',
|
|
374
|
+
},
|
|
375
|
+
{
|
|
376
|
+
displayName: 'Include Links',
|
|
377
|
+
name: 'includeLinks',
|
|
378
|
+
type: 'boolean',
|
|
379
|
+
default: true,
|
|
380
|
+
description: 'Whether to include structured internal/external links data in output',
|
|
381
|
+
},
|
|
263
382
|
{
|
|
264
383
|
displayName: 'Include Media Data',
|
|
265
384
|
name: 'includeMedia',
|
|
266
385
|
type: 'boolean',
|
|
267
386
|
default: false,
|
|
268
|
-
description: 'Whether to include media data in output (images, videos)',
|
|
387
|
+
description: 'Whether to include media data in output (images, videos, audios)',
|
|
388
|
+
},
|
|
389
|
+
{
|
|
390
|
+
displayName: 'Include Tables',
|
|
391
|
+
name: 'includeTables',
|
|
392
|
+
type: 'boolean',
|
|
393
|
+
default: true,
|
|
394
|
+
description: 'Whether to include extracted tables in the output (if table extraction is enabled)',
|
|
395
|
+
},
|
|
396
|
+
{
|
|
397
|
+
displayName: 'Markdown Output',
|
|
398
|
+
name: 'markdownOutput',
|
|
399
|
+
type: 'options',
|
|
400
|
+
options: [
|
|
401
|
+
{
|
|
402
|
+
name: 'Raw Markdown',
|
|
403
|
+
value: 'raw',
|
|
404
|
+
description: 'Return raw markdown (default, full content)',
|
|
405
|
+
},
|
|
406
|
+
{
|
|
407
|
+
name: 'Filtered Markdown',
|
|
408
|
+
value: 'fit',
|
|
409
|
+
description: 'Return content-filtered markdown (cleaner, main content only)',
|
|
410
|
+
},
|
|
411
|
+
{
|
|
412
|
+
name: 'Both',
|
|
413
|
+
value: 'both',
|
|
414
|
+
description: 'Return both raw and filtered markdown variants',
|
|
415
|
+
},
|
|
416
|
+
],
|
|
417
|
+
default: 'raw',
|
|
418
|
+
description: 'Which markdown variant(s) to return in the output',
|
|
269
419
|
},
|
|
270
420
|
{
|
|
271
421
|
displayName: 'Verbose Response',
|
|
@@ -276,30 +426,586 @@ exports.description = [
|
|
|
276
426
|
},
|
|
277
427
|
],
|
|
278
428
|
},
|
|
429
|
+
{
|
|
430
|
+
displayName: 'Content Filter',
|
|
431
|
+
name: 'contentFilter',
|
|
432
|
+
type: 'collection',
|
|
433
|
+
placeholder: 'Add Filter',
|
|
434
|
+
default: {},
|
|
435
|
+
displayOptions: {
|
|
436
|
+
show: {
|
|
437
|
+
operation: ['crawlSingleUrl'],
|
|
438
|
+
},
|
|
439
|
+
},
|
|
440
|
+
options: [
|
|
441
|
+
{
|
|
442
|
+
displayName: 'BM25 Threshold',
|
|
443
|
+
name: 'bm25Threshold',
|
|
444
|
+
type: 'number',
|
|
445
|
+
default: 1.0,
|
|
446
|
+
displayOptions: {
|
|
447
|
+
show: {
|
|
448
|
+
filterType: ['bm25'],
|
|
449
|
+
},
|
|
450
|
+
},
|
|
451
|
+
description: 'Minimum BM25 score threshold for content inclusion (default: 1.0)',
|
|
452
|
+
},
|
|
453
|
+
{
|
|
454
|
+
displayName: 'Chunk Token Threshold',
|
|
455
|
+
name: 'chunkTokenThreshold',
|
|
456
|
+
type: 'number',
|
|
457
|
+
displayOptions: {
|
|
458
|
+
show: {
|
|
459
|
+
filterType: ['llm'],
|
|
460
|
+
},
|
|
461
|
+
},
|
|
462
|
+
default: 8192,
|
|
463
|
+
description: 'Maximum tokens per chunk for LLM processing (default: 8192, recommended: 4096-16384)',
|
|
464
|
+
},
|
|
465
|
+
{
|
|
466
|
+
displayName: 'Filter Type',
|
|
467
|
+
name: 'filterType',
|
|
468
|
+
type: 'options',
|
|
469
|
+
options: [
|
|
470
|
+
{
|
|
471
|
+
name: 'None',
|
|
472
|
+
value: 'none',
|
|
473
|
+
description: 'No content filtering (return all content)',
|
|
474
|
+
},
|
|
475
|
+
{
|
|
476
|
+
name: 'Pruning Filter',
|
|
477
|
+
value: 'pruning',
|
|
478
|
+
description: 'Remove low-value content using relevance thresholds',
|
|
479
|
+
},
|
|
480
|
+
{
|
|
481
|
+
name: 'BM25 Filter',
|
|
482
|
+
value: 'bm25',
|
|
483
|
+
description: 'Filter content based on query relevance using BM25 algorithm',
|
|
484
|
+
},
|
|
485
|
+
{
|
|
486
|
+
name: 'LLM Filter',
|
|
487
|
+
value: 'llm',
|
|
488
|
+
description: 'Intelligent content filtering using LLM (requires LLM credentials)',
|
|
489
|
+
},
|
|
490
|
+
],
|
|
491
|
+
default: 'none',
|
|
492
|
+
description: 'Type of content filtering to apply',
|
|
493
|
+
},
|
|
494
|
+
{
|
|
495
|
+
displayName: 'Ignore Cache',
|
|
496
|
+
name: 'ignoreCache',
|
|
497
|
+
type: 'boolean',
|
|
498
|
+
displayOptions: {
|
|
499
|
+
show: {
|
|
500
|
+
filterType: ['llm'],
|
|
501
|
+
},
|
|
502
|
+
},
|
|
503
|
+
default: false,
|
|
504
|
+
description: 'Whether to skip cache and always generate fresh filtered content',
|
|
505
|
+
},
|
|
506
|
+
{
|
|
507
|
+
displayName: 'Ignore Links',
|
|
508
|
+
name: 'ignoreLinks',
|
|
509
|
+
type: 'boolean',
|
|
510
|
+
default: false,
|
|
511
|
+
description: 'Whether to exclude links from markdown output',
|
|
512
|
+
},
|
|
513
|
+
{
|
|
514
|
+
displayName: 'LLM Instruction',
|
|
515
|
+
name: 'llmInstruction',
|
|
516
|
+
type: 'string',
|
|
517
|
+
typeOptions: {
|
|
518
|
+
rows: 8,
|
|
519
|
+
},
|
|
520
|
+
displayOptions: {
|
|
521
|
+
show: {
|
|
522
|
+
filterType: ['llm'],
|
|
523
|
+
},
|
|
524
|
+
},
|
|
525
|
+
default: `Extract the main content while preserving its original wording and substance completely.
|
|
526
|
+
Remove only clearly irrelevant elements like:
|
|
527
|
+
- Navigation menus
|
|
528
|
+
- Advertisement sections
|
|
529
|
+
- Cookie notices
|
|
530
|
+
- Footers with site information
|
|
531
|
+
- Sidebars with external links
|
|
532
|
+
- Any UI elements that don't contribute to the content
|
|
533
|
+
|
|
534
|
+
Keep all valuable educational or informational content intact.`,
|
|
535
|
+
description: 'Instructions for the LLM on how to filter and clean the content',
|
|
536
|
+
required: true,
|
|
537
|
+
},
|
|
538
|
+
{
|
|
539
|
+
displayName: 'LLM Verbose',
|
|
540
|
+
name: 'llmVerbose',
|
|
541
|
+
type: 'boolean',
|
|
542
|
+
displayOptions: {
|
|
543
|
+
show: {
|
|
544
|
+
filterType: ['llm'],
|
|
545
|
+
},
|
|
546
|
+
},
|
|
547
|
+
default: false,
|
|
548
|
+
description: 'Whether to enable verbose logging for LLM content filtering',
|
|
549
|
+
},
|
|
550
|
+
{
|
|
551
|
+
displayName: 'Min Word Threshold',
|
|
552
|
+
name: 'minWordThreshold',
|
|
553
|
+
type: 'number',
|
|
554
|
+
default: 0,
|
|
555
|
+
displayOptions: {
|
|
556
|
+
show: {
|
|
557
|
+
filterType: ['pruning'],
|
|
558
|
+
},
|
|
559
|
+
},
|
|
560
|
+
description: 'Minimum word count for content blocks to be considered (0 = no minimum)',
|
|
561
|
+
},
|
|
562
|
+
{
|
|
563
|
+
displayName: 'Threshold',
|
|
564
|
+
name: 'threshold',
|
|
565
|
+
type: 'number',
|
|
566
|
+
default: 0.48,
|
|
567
|
+
displayOptions: {
|
|
568
|
+
show: {
|
|
569
|
+
filterType: ['pruning'],
|
|
570
|
+
},
|
|
571
|
+
},
|
|
572
|
+
description: 'Relevance threshold for pruning (0.0-1.0, default: 0.48). Higher values = more aggressive filtering.',
|
|
573
|
+
},
|
|
574
|
+
{
|
|
575
|
+
displayName: 'Threshold Type',
|
|
576
|
+
name: 'thresholdType',
|
|
577
|
+
type: 'options',
|
|
578
|
+
options: [
|
|
579
|
+
{
|
|
580
|
+
name: 'Fixed',
|
|
581
|
+
value: 'fixed',
|
|
582
|
+
description: 'Use fixed threshold value',
|
|
583
|
+
},
|
|
584
|
+
{
|
|
585
|
+
name: 'Dynamic',
|
|
586
|
+
value: 'dynamic',
|
|
587
|
+
description: 'Calculate threshold dynamically based on content',
|
|
588
|
+
},
|
|
589
|
+
],
|
|
590
|
+
default: 'fixed',
|
|
591
|
+
displayOptions: {
|
|
592
|
+
show: {
|
|
593
|
+
filterType: ['pruning'],
|
|
594
|
+
},
|
|
595
|
+
},
|
|
596
|
+
description: 'How to apply the pruning threshold',
|
|
597
|
+
},
|
|
598
|
+
{
|
|
599
|
+
displayName: 'User Query',
|
|
600
|
+
name: 'userQuery',
|
|
601
|
+
type: 'string',
|
|
602
|
+
default: '',
|
|
603
|
+
placeholder: 'main content topics keywords',
|
|
604
|
+
displayOptions: {
|
|
605
|
+
show: {
|
|
606
|
+
filterType: ['bm25'],
|
|
607
|
+
},
|
|
608
|
+
},
|
|
609
|
+
description: 'Query string to filter relevant content (BM25 will rank content by relevance to this query)',
|
|
610
|
+
},
|
|
611
|
+
],
|
|
612
|
+
},
|
|
613
|
+
{
|
|
614
|
+
displayName: 'Advanced Options',
|
|
615
|
+
name: 'advancedOptions',
|
|
616
|
+
type: 'collection',
|
|
617
|
+
placeholder: 'Add Option',
|
|
618
|
+
default: {},
|
|
619
|
+
displayOptions: {
|
|
620
|
+
show: {
|
|
621
|
+
operation: ['crawlSingleUrl'],
|
|
622
|
+
},
|
|
623
|
+
},
|
|
624
|
+
options: [
|
|
625
|
+
{
|
|
626
|
+
displayName: 'Anti-Bot Features',
|
|
627
|
+
name: 'antiBotFeatures',
|
|
628
|
+
type: 'fixedCollection',
|
|
629
|
+
default: {},
|
|
630
|
+
options: [
|
|
631
|
+
{
|
|
632
|
+
name: 'features',
|
|
633
|
+
displayName: 'Features',
|
|
634
|
+
values: [
|
|
635
|
+
{
|
|
636
|
+
displayName: 'Magic Mode',
|
|
637
|
+
name: 'magic',
|
|
638
|
+
type: 'boolean',
|
|
639
|
+
default: false,
|
|
640
|
+
description: 'Whether to enable anti-detection techniques (stealth++)',
|
|
641
|
+
},
|
|
642
|
+
{
|
|
643
|
+
displayName: 'Simulate User Behavior',
|
|
644
|
+
name: 'simulateUser',
|
|
645
|
+
type: 'boolean',
|
|
646
|
+
default: false,
|
|
647
|
+
description: 'Whether to simulate human-like browsing behavior',
|
|
648
|
+
},
|
|
649
|
+
{
|
|
650
|
+
displayName: 'Override Navigator',
|
|
651
|
+
name: 'overrideNavigator',
|
|
652
|
+
type: 'boolean',
|
|
653
|
+
default: false,
|
|
654
|
+
description: 'Whether to override navigator properties to avoid detection',
|
|
655
|
+
},
|
|
656
|
+
],
|
|
657
|
+
},
|
|
658
|
+
],
|
|
659
|
+
},
|
|
660
|
+
{
|
|
661
|
+
displayName: 'Delay Before Return (Ms)',
|
|
662
|
+
name: 'delayBeforeReturnHtml',
|
|
663
|
+
type: 'number',
|
|
664
|
+
default: 0,
|
|
665
|
+
description: 'Milliseconds to wait before returning HTML (useful for dynamic content)',
|
|
666
|
+
},
|
|
667
|
+
{
|
|
668
|
+
displayName: 'Exclude External Images',
|
|
669
|
+
name: 'excludeExternalImages',
|
|
670
|
+
type: 'boolean',
|
|
671
|
+
default: false,
|
|
672
|
+
description: 'Whether to exclude images hosted on external domains',
|
|
673
|
+
},
|
|
674
|
+
{
|
|
675
|
+
displayName: 'Exclude Social Media Links',
|
|
676
|
+
name: 'excludeSocialMediaLinks',
|
|
677
|
+
type: 'boolean',
|
|
678
|
+
default: false,
|
|
679
|
+
description: 'Whether to exclude links to social media platforms',
|
|
680
|
+
},
|
|
681
|
+
{
|
|
682
|
+
displayName: 'Verbose Mode',
|
|
683
|
+
name: 'verbose',
|
|
684
|
+
type: 'boolean',
|
|
685
|
+
default: false,
|
|
686
|
+
description: 'Whether to enable verbose logging (debug mode)',
|
|
687
|
+
},
|
|
688
|
+
{
|
|
689
|
+
displayName: 'Wait Until',
|
|
690
|
+
name: 'waitUntil',
|
|
691
|
+
type: 'options',
|
|
692
|
+
options: [
|
|
693
|
+
{
|
|
694
|
+
name: 'Load',
|
|
695
|
+
value: 'load',
|
|
696
|
+
description: 'Wait for the load event',
|
|
697
|
+
},
|
|
698
|
+
{
|
|
699
|
+
name: 'DOM Content Loaded',
|
|
700
|
+
value: 'domcontentloaded',
|
|
701
|
+
description: 'Wait for DOMContentLoaded event',
|
|
702
|
+
},
|
|
703
|
+
{
|
|
704
|
+
name: 'Network Idle',
|
|
705
|
+
value: 'networkidle',
|
|
706
|
+
description: 'Wait for network to be idle (no requests for 500ms)',
|
|
707
|
+
},
|
|
708
|
+
{
|
|
709
|
+
name: 'Network Idle 2',
|
|
710
|
+
value: 'networkidle2',
|
|
711
|
+
description: 'Wait for network to be idle (no more than 2 requests for 500ms)',
|
|
712
|
+
},
|
|
713
|
+
],
|
|
714
|
+
default: 'load',
|
|
715
|
+
description: 'When to consider page load complete',
|
|
716
|
+
},
|
|
717
|
+
],
|
|
718
|
+
},
|
|
719
|
+
{
|
|
720
|
+
displayName: 'Table Extraction',
|
|
721
|
+
name: 'tableExtraction',
|
|
722
|
+
type: 'collection',
|
|
723
|
+
placeholder: 'Add Option',
|
|
724
|
+
default: {},
|
|
725
|
+
displayOptions: {
|
|
726
|
+
show: {
|
|
727
|
+
operation: ['crawlSingleUrl'],
|
|
728
|
+
},
|
|
729
|
+
},
|
|
730
|
+
options: [
|
|
731
|
+
{
|
|
732
|
+
displayName: 'Chunk Token Threshold',
|
|
733
|
+
name: 'chunkTokenThreshold',
|
|
734
|
+
type: 'number',
|
|
735
|
+
displayOptions: {
|
|
736
|
+
show: {
|
|
737
|
+
strategyType: ['llm'],
|
|
738
|
+
enableChunking: [true],
|
|
739
|
+
},
|
|
740
|
+
},
|
|
741
|
+
default: 10000,
|
|
742
|
+
description: 'Maximum tokens per chunk when processing large tables (default: 10000)',
|
|
743
|
+
},
|
|
744
|
+
{
|
|
745
|
+
displayName: 'CSS Selector',
|
|
746
|
+
name: 'cssSelector',
|
|
747
|
+
type: 'string',
|
|
748
|
+
displayOptions: {
|
|
749
|
+
show: {
|
|
750
|
+
strategyType: ['llm'],
|
|
751
|
+
},
|
|
752
|
+
},
|
|
753
|
+
default: '',
|
|
754
|
+
placeholder: '.main-content',
|
|
755
|
+
description: 'CSS selector to focus table extraction on specific page area (optional)',
|
|
756
|
+
},
|
|
757
|
+
{
|
|
758
|
+
displayName: 'Enable Chunking',
|
|
759
|
+
name: 'enableChunking',
|
|
760
|
+
type: 'boolean',
|
|
761
|
+
displayOptions: {
|
|
762
|
+
show: {
|
|
763
|
+
strategyType: ['llm'],
|
|
764
|
+
},
|
|
765
|
+
},
|
|
766
|
+
default: false,
|
|
767
|
+
description: 'Whether to enable chunking for large tables (recommended for tables with 100+ rows)',
|
|
768
|
+
},
|
|
769
|
+
{
|
|
770
|
+
displayName: 'Max Parallel Chunks',
|
|
771
|
+
name: 'maxParallelChunks',
|
|
772
|
+
type: 'number',
|
|
773
|
+
displayOptions: {
|
|
774
|
+
show: {
|
|
775
|
+
strategyType: ['llm'],
|
|
776
|
+
enableChunking: [true],
|
|
777
|
+
},
|
|
778
|
+
},
|
|
779
|
+
default: 5,
|
|
780
|
+
description: 'Maximum number of chunks to process in parallel (default: 5)',
|
|
781
|
+
},
|
|
782
|
+
{
|
|
783
|
+
displayName: 'Max Tries',
|
|
784
|
+
name: 'maxTries',
|
|
785
|
+
type: 'number',
|
|
786
|
+
displayOptions: {
|
|
787
|
+
show: {
|
|
788
|
+
strategyType: ['llm'],
|
|
789
|
+
},
|
|
790
|
+
},
|
|
791
|
+
default: 3,
|
|
792
|
+
description: 'Maximum number of retry attempts for LLM extraction (default: 3)',
|
|
793
|
+
},
|
|
794
|
+
{
|
|
795
|
+
displayName: 'Min Rows Per Chunk',
|
|
796
|
+
name: 'minRowsPerChunk',
|
|
797
|
+
type: 'number',
|
|
798
|
+
displayOptions: {
|
|
799
|
+
show: {
|
|
800
|
+
strategyType: ['llm'],
|
|
801
|
+
enableChunking: [true],
|
|
802
|
+
},
|
|
803
|
+
},
|
|
804
|
+
default: 20,
|
|
805
|
+
description: 'Minimum number of rows per chunk (default: 20)',
|
|
806
|
+
},
|
|
807
|
+
{
|
|
808
|
+
displayName: 'Strategy Type',
|
|
809
|
+
name: 'strategyType',
|
|
810
|
+
type: 'options',
|
|
811
|
+
options: [
|
|
812
|
+
{
|
|
813
|
+
name: 'None',
|
|
814
|
+
value: 'none',
|
|
815
|
+
description: 'No table extraction',
|
|
816
|
+
},
|
|
817
|
+
{
|
|
818
|
+
name: 'LLM Table Extraction',
|
|
819
|
+
value: 'llm',
|
|
820
|
+
description: 'Extract tables using LLM (handles complex tables with rowspan/colspan)',
|
|
821
|
+
},
|
|
822
|
+
{
|
|
823
|
+
name: 'Default Table Extraction',
|
|
824
|
+
value: 'default',
|
|
825
|
+
description: 'Extract tables using heuristics (faster, simpler tables only)',
|
|
826
|
+
},
|
|
827
|
+
],
|
|
828
|
+
default: 'none',
|
|
829
|
+
description: 'Table extraction strategy to use',
|
|
830
|
+
},
|
|
831
|
+
{
|
|
832
|
+
displayName: 'Table Score Threshold',
|
|
833
|
+
name: 'tableScoreThreshold',
|
|
834
|
+
type: 'number',
|
|
835
|
+
displayOptions: {
|
|
836
|
+
show: {
|
|
837
|
+
strategyType: ['default'],
|
|
838
|
+
},
|
|
839
|
+
},
|
|
840
|
+
default: 5,
|
|
841
|
+
description: 'Minimum score for table to be included in results (default: 5, range: 0-10)',
|
|
842
|
+
},
|
|
843
|
+
{
|
|
844
|
+
displayName: 'Verbose',
|
|
845
|
+
name: 'verbose',
|
|
846
|
+
type: 'boolean',
|
|
847
|
+
displayOptions: {
|
|
848
|
+
show: {
|
|
849
|
+
strategyType: ['llm', 'default'],
|
|
850
|
+
},
|
|
851
|
+
},
|
|
852
|
+
default: false,
|
|
853
|
+
description: 'Whether to enable verbose logging for table extraction',
|
|
854
|
+
},
|
|
855
|
+
],
|
|
856
|
+
},
|
|
279
857
|
];
|
|
280
858
|
async function execute(items, nodeOptions) {
|
|
281
|
-
var _a;
|
|
859
|
+
var _a, _b;
|
|
282
860
|
const allResults = [];
|
|
283
861
|
for (let i = 0; i < items.length; i++) {
|
|
284
862
|
try {
|
|
285
863
|
const url = this.getNodeParameter('url', i, '');
|
|
286
864
|
const browserOptions = this.getNodeParameter('browserOptions', i, {});
|
|
865
|
+
const sessionOptions = this.getNodeParameter('sessionOptions', i, {});
|
|
287
866
|
const crawlerOptions = this.getNodeParameter('crawlerOptions', i, {});
|
|
288
|
-
const
|
|
867
|
+
const outputOptions = this.getNodeParameter('outputOptions', i, {});
|
|
868
|
+
const contentFilter = this.getNodeParameter('contentFilter', i, {});
|
|
869
|
+
const advancedOptions = this.getNodeParameter('advancedOptions', i, {});
|
|
870
|
+
let mergedBrowserOptions = { ...browserOptions, ...sessionOptions };
|
|
871
|
+
if (browserOptions.extraArgs && typeof browserOptions.extraArgs === 'object') {
|
|
872
|
+
const extraArgsCollection = browserOptions.extraArgs;
|
|
873
|
+
if (extraArgsCollection.args && Array.isArray(extraArgsCollection.args)) {
|
|
874
|
+
mergedBrowserOptions.extraArgs = extraArgsCollection.args.map((arg) => arg.value).filter((v) => v);
|
|
875
|
+
}
|
|
876
|
+
}
|
|
289
877
|
if (!url) {
|
|
290
878
|
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i });
|
|
291
879
|
}
|
|
292
880
|
if (!(0, utils_1.isValidUrl)(url)) {
|
|
293
881
|
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i });
|
|
294
882
|
}
|
|
295
|
-
const browserConfig = (0, utils_1.createBrowserConfig)(
|
|
883
|
+
const browserConfig = (0, utils_1.createBrowserConfig)(mergedBrowserOptions);
|
|
884
|
+
const antiBotFeatures = ((_a = advancedOptions.antiBotFeatures) === null || _a === void 0 ? void 0 : _a.features) || {};
|
|
296
885
|
const crawlerConfig = (0, utils_1.createCrawlerRunConfig)({
|
|
297
886
|
...crawlerOptions,
|
|
298
887
|
...browserConfig,
|
|
888
|
+
screenshot: outputOptions.screenshot,
|
|
889
|
+
pdf: outputOptions.pdf,
|
|
890
|
+
fetchSslCertificate: outputOptions.fetchSslCertificate,
|
|
891
|
+
magic: antiBotFeatures.magic,
|
|
892
|
+
simulateUser: antiBotFeatures.simulateUser,
|
|
893
|
+
overrideNavigator: antiBotFeatures.overrideNavigator,
|
|
894
|
+
excludeSocialMediaLinks: advancedOptions.excludeSocialMediaLinks,
|
|
895
|
+
excludeExternalImages: advancedOptions.excludeExternalImages,
|
|
896
|
+
delayBeforeReturnHtml: advancedOptions.delayBeforeReturnHtml,
|
|
897
|
+
waitUntil: advancedOptions.waitUntil,
|
|
898
|
+
verbose: advancedOptions.verbose,
|
|
299
899
|
});
|
|
900
|
+
if (contentFilter.filterType && contentFilter.filterType !== 'none') {
|
|
901
|
+
const enrichedFilterConfig = { ...contentFilter };
|
|
902
|
+
if (contentFilter.filterType === 'llm') {
|
|
903
|
+
const credentials = await this.getCredentials('crawl4aiPlusApi');
|
|
904
|
+
if (!credentials.enableLlm) {
|
|
905
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM content filtering.', { itemIndex: i });
|
|
906
|
+
}
|
|
907
|
+
let provider = 'openai/gpt-4o';
|
|
908
|
+
let apiKey = '';
|
|
909
|
+
if (credentials.llmProvider === 'openai') {
|
|
910
|
+
const model = credentials.llmModel || 'gpt-4o';
|
|
911
|
+
provider = `openai/${model}`;
|
|
912
|
+
apiKey = credentials.apiKey || '';
|
|
913
|
+
}
|
|
914
|
+
else if (credentials.llmProvider === 'anthropic') {
|
|
915
|
+
const model = credentials.llmModel || 'claude-3-haiku-20240307';
|
|
916
|
+
provider = `anthropic/${model}`;
|
|
917
|
+
apiKey = credentials.apiKey || '';
|
|
918
|
+
}
|
|
919
|
+
else if (credentials.llmProvider === 'groq') {
|
|
920
|
+
const model = credentials.llmModel || 'llama3-70b-8192';
|
|
921
|
+
provider = `groq/${model}`;
|
|
922
|
+
apiKey = credentials.apiKey || '';
|
|
923
|
+
}
|
|
924
|
+
else if (credentials.llmProvider === 'ollama') {
|
|
925
|
+
const model = credentials.ollamaModel || 'llama3';
|
|
926
|
+
provider = `ollama/${model}`;
|
|
927
|
+
}
|
|
928
|
+
else if (credentials.llmProvider === 'other') {
|
|
929
|
+
provider = credentials.customProvider || 'custom/model';
|
|
930
|
+
apiKey = credentials.customApiKey || '';
|
|
931
|
+
}
|
|
932
|
+
if (!apiKey && credentials.llmProvider !== 'ollama') {
|
|
933
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
|
|
934
|
+
}
|
|
935
|
+
enrichedFilterConfig.llmConfig = {
|
|
936
|
+
type: 'LLMConfig',
|
|
937
|
+
params: {
|
|
938
|
+
provider,
|
|
939
|
+
api_token: apiKey,
|
|
940
|
+
...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
|
|
941
|
+
{ api_base: credentials.customBaseUrl } : {}),
|
|
942
|
+
...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
|
|
943
|
+
{ api_base: credentials.ollamaUrl } : {})
|
|
944
|
+
}
|
|
945
|
+
};
|
|
946
|
+
}
|
|
947
|
+
crawlerConfig.markdownGenerator = (0, utils_1.createMarkdownGenerator)(enrichedFilterConfig);
|
|
948
|
+
}
|
|
949
|
+
const tableExtractionConfig = this.getNodeParameter('tableExtraction', i, {});
|
|
950
|
+
if (tableExtractionConfig.strategyType && tableExtractionConfig.strategyType !== 'none') {
|
|
951
|
+
const enrichedTableConfig = { ...tableExtractionConfig };
|
|
952
|
+
if (tableExtractionConfig.strategyType === 'llm') {
|
|
953
|
+
const credentials = await this.getCredentials('crawl4aiPlusApi');
|
|
954
|
+
if (!credentials.enableLlm) {
|
|
955
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM table extraction.', { itemIndex: i });
|
|
956
|
+
}
|
|
957
|
+
let provider = 'openai/gpt-4o';
|
|
958
|
+
let apiKey = '';
|
|
959
|
+
if (credentials.llmProvider === 'openai') {
|
|
960
|
+
const model = credentials.llmModel || 'gpt-4o';
|
|
961
|
+
provider = `openai/${model}`;
|
|
962
|
+
apiKey = credentials.apiKey || '';
|
|
963
|
+
}
|
|
964
|
+
else if (credentials.llmProvider === 'anthropic') {
|
|
965
|
+
const model = credentials.llmModel || 'claude-3-haiku-20240307';
|
|
966
|
+
provider = `anthropic/${model}`;
|
|
967
|
+
apiKey = credentials.apiKey || '';
|
|
968
|
+
}
|
|
969
|
+
else if (credentials.llmProvider === 'groq') {
|
|
970
|
+
const model = credentials.llmModel || 'llama3-70b-8192';
|
|
971
|
+
provider = `groq/${model}`;
|
|
972
|
+
apiKey = credentials.apiKey || '';
|
|
973
|
+
}
|
|
974
|
+
else if (credentials.llmProvider === 'ollama') {
|
|
975
|
+
const model = credentials.ollamaModel || 'llama3';
|
|
976
|
+
provider = `ollama/${model}`;
|
|
977
|
+
}
|
|
978
|
+
else if (credentials.llmProvider === 'other') {
|
|
979
|
+
provider = credentials.customProvider || 'custom/model';
|
|
980
|
+
apiKey = credentials.customApiKey || '';
|
|
981
|
+
}
|
|
982
|
+
if (!apiKey && credentials.llmProvider !== 'ollama') {
|
|
983
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
|
|
984
|
+
}
|
|
985
|
+
enrichedTableConfig.llmConfig = {
|
|
986
|
+
type: 'LLMConfig',
|
|
987
|
+
params: {
|
|
988
|
+
provider,
|
|
989
|
+
api_token: apiKey,
|
|
990
|
+
...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
|
|
991
|
+
{ api_base: credentials.customBaseUrl } : {}),
|
|
992
|
+
...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
|
|
993
|
+
{ api_base: credentials.ollamaUrl } : {})
|
|
994
|
+
}
|
|
995
|
+
};
|
|
996
|
+
}
|
|
997
|
+
crawlerConfig.tableExtraction = (0, utils_1.createTableExtractionStrategy)(enrichedTableConfig);
|
|
998
|
+
}
|
|
300
999
|
const crawler = await (0, utils_1.getCrawl4aiClient)(this);
|
|
301
1000
|
const result = await crawler.crawlUrl(url, crawlerConfig);
|
|
302
|
-
const formattedResult = (0, formatters_1.formatCrawlResult)(result,
|
|
1001
|
+
const formattedResult = (0, formatters_1.formatCrawlResult)(result, outputOptions.includeMedia, outputOptions.verboseResponse, {
|
|
1002
|
+
markdownOutput: outputOptions.markdownOutput,
|
|
1003
|
+
includeLinks: outputOptions.includeLinks,
|
|
1004
|
+
includeScreenshot: outputOptions.screenshot,
|
|
1005
|
+
includePdf: outputOptions.pdf,
|
|
1006
|
+
includeSslCertificate: outputOptions.fetchSslCertificate,
|
|
1007
|
+
includeTables: outputOptions.includeTables,
|
|
1008
|
+
});
|
|
303
1009
|
allResults.push({
|
|
304
1010
|
json: formattedResult,
|
|
305
1011
|
pairedItem: { item: i },
|
|
@@ -308,7 +1014,7 @@ async function execute(items, nodeOptions) {
|
|
|
308
1014
|
catch (error) {
|
|
309
1015
|
if (this.continueOnFail()) {
|
|
310
1016
|
const node = this.getNode();
|
|
311
|
-
const errorItemIndex = (
|
|
1017
|
+
const errorItemIndex = (_b = error.itemIndex) !== null && _b !== void 0 ? _b : i;
|
|
312
1018
|
allResults.push({
|
|
313
1019
|
json: items[i].json,
|
|
314
1020
|
error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }),
|