n8n-nodes-crawl4ai-plus 2.0.9 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/LICENSE +23 -23
  2. package/README.md +129 -41
  3. package/dist/credentials/Crawl4aiApi.credentials.js +2 -34
  4. package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -1
  5. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js +1230 -30
  6. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -1
  7. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js +715 -9
  8. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -1
  9. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.d.ts +4 -0
  10. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js +495 -0
  11. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js.map +1 -0
  12. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js +9 -0
  13. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js.map +1 -1
  14. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js +1 -1
  15. package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js.map +1 -1
  16. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.d.ts +4 -1
  17. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js +94 -60
  18. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js.map +1 -1
  19. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.d.ts +8 -1
  20. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js +49 -12
  21. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js.map +1 -1
  22. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/interfaces.d.ts +38 -5
  23. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.d.ts +13 -0
  24. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js +270 -0
  25. package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js.map +1 -1
  26. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.d.ts +4 -0
  27. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js +445 -0
  28. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js.map +1 -0
  29. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js +108 -8
  30. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js.map +1 -1
  31. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js +49 -9
  32. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js.map +1 -1
  33. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js +134 -17
  34. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js.map +1 -1
  35. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js +27 -9
  36. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js.map +1 -1
  37. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js +206 -9
  38. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js.map +1 -1
  39. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.d.ts +4 -0
  40. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js +376 -0
  41. package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js.map +1 -0
  42. package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.d.ts +4 -2
  43. package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js +53 -16
  44. package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js.map +1 -1
  45. package/dist/tsconfig.tsbuildinfo +1 -1
  46. package/index.js +11 -11
  47. package/package.json +1 -1
@@ -54,7 +54,7 @@ exports.description = [
54
54
  },
55
55
  ],
56
56
  default: 'chromium',
57
- description: 'Which browser engine to use for crawling',
57
+ description: 'Which browser engine to use for crawling. Default: Chromium (if not specified).',
58
58
  },
59
59
  {
60
60
  displayName: 'Enable JavaScript',
@@ -70,6 +70,32 @@ exports.description = [
70
70
  default: false,
71
71
  description: 'Whether to enable stealth mode to bypass basic bot detection (hides webdriver properties and modifies browser fingerprints)',
72
72
  },
73
+ {
74
+ displayName: 'Extra Browser Arguments',
75
+ name: 'extraArgs',
76
+ type: 'fixedCollection',
77
+ typeOptions: {
78
+ multipleValues: true,
79
+ },
80
+ default: {},
81
+ description: 'Additional command-line arguments to pass to the browser (advanced users only)',
82
+ options: [
83
+ {
84
+ name: 'args',
85
+ displayName: 'Arguments',
86
+ values: [
87
+ {
88
+ displayName: 'Argument',
89
+ name: 'value',
90
+ type: 'string',
91
+ default: '',
92
+ placeholder: '--disable-blink-features=AutomationControlled',
93
+ description: 'Browser command-line argument (e.g., --disable-blink-features=AutomationControlled)',
94
+ },
95
+ ],
96
+ },
97
+ ],
98
+ },
73
99
  {
74
100
  displayName: 'Headless Mode',
75
101
  name: 'headless',
@@ -108,6 +134,71 @@ exports.description = [
108
134
  },
109
135
  ],
110
136
  },
137
+ {
138
+ displayName: 'Session & Authentication',
139
+ name: 'sessionOptions',
140
+ type: 'collection',
141
+ placeholder: 'Add Option',
142
+ default: {},
143
+ displayOptions: {
144
+ show: {
145
+ operation: ['crawlSingleUrl'],
146
+ },
147
+ },
148
+ options: [
149
+ {
150
+ displayName: 'Cookies',
151
+ name: 'cookies',
152
+ type: 'json',
153
+ default: '',
154
+ placeholder: '[{"name": "session_id", "value": "abc123", "domain": ".example.com", "path": "/"}]',
155
+ description: 'Array of cookie objects to inject. Alternative to storage state for simple cookie-based auth.',
156
+ },
157
+ {
158
+ displayName: 'Storage State (JSON)',
159
+ name: 'storageState',
160
+ type: 'string',
161
+ typeOptions: {
162
+ rows: 6,
163
+ },
164
+ default: '',
165
+ placeholder: '{"cookies": [...], "origins": [...]}',
166
+ description: 'Browser storage state as JSON (cookies, localStorage, sessionStorage). Captures authenticated session state. Works in all n8n environments.',
167
+ },
168
+ {
169
+ displayName: 'Use Managed Browser',
170
+ name: 'useManagedBrowser',
171
+ type: 'boolean',
172
+ default: false,
173
+ description: 'Whether to use managed browser mode (required for persistent contexts). Advanced option.',
174
+ displayOptions: {
175
+ show: {
176
+ usePersistentContext: [true],
177
+ },
178
+ },
179
+ },
180
+ {
181
+ displayName: 'Use Persistent Browser Context',
182
+ name: 'usePersistentContext',
183
+ type: 'boolean',
184
+ default: false,
185
+ description: 'Whether to use a persistent browser context (requires user data directory). Only use in self-hosted environments with persistent storage.',
186
+ },
187
+ {
188
+ displayName: 'User Data Directory',
189
+ name: 'userDataDir',
190
+ type: 'string',
191
+ default: '',
192
+ placeholder: '/data/browser-profiles/profile1',
193
+ description: 'Path to browser profile directory for persistent sessions. Advanced: Only works in self-hosted n8n with persistent volumes. Use Storage State for cloud deployments.',
194
+ displayOptions: {
195
+ show: {
196
+ usePersistentContext: [true],
197
+ },
198
+ },
199
+ },
200
+ ],
201
+ },
111
202
  {
112
203
  displayName: 'Crawler Options',
113
204
  name: 'crawlerOptions',
@@ -249,8 +340,8 @@ exports.description = [
249
340
  ],
250
341
  },
251
342
  {
252
- displayName: 'Options',
253
- name: 'options',
343
+ displayName: 'Output Options',
344
+ name: 'outputOptions',
254
345
  type: 'collection',
255
346
  placeholder: 'Add Option',
256
347
  default: {},
@@ -260,12 +351,71 @@ exports.description = [
260
351
  },
261
352
  },
262
353
  options: [
354
+ {
355
+ displayName: 'Capture Screenshot',
356
+ name: 'screenshot',
357
+ type: 'boolean',
358
+ default: false,
359
+ description: 'Whether to capture a screenshot of the page (returned as base64)',
360
+ },
361
+ {
362
+ displayName: 'Fetch SSL Certificate',
363
+ name: 'fetchSslCertificate',
364
+ type: 'boolean',
365
+ default: false,
366
+ description: 'Whether to retrieve SSL certificate information from the server',
367
+ },
368
+ {
369
+ displayName: 'Generate PDF',
370
+ name: 'pdf',
371
+ type: 'boolean',
372
+ default: false,
373
+ description: 'Whether to generate a PDF of the page (returned as base64 or binary)',
374
+ },
375
+ {
376
+ displayName: 'Include Links',
377
+ name: 'includeLinks',
378
+ type: 'boolean',
379
+ default: true,
380
+ description: 'Whether to include structured internal/external links data in output',
381
+ },
263
382
  {
264
383
  displayName: 'Include Media Data',
265
384
  name: 'includeMedia',
266
385
  type: 'boolean',
267
386
  default: false,
268
- description: 'Whether to include media data in output (images, videos)',
387
+ description: 'Whether to include media data in output (images, videos, audios)',
388
+ },
389
+ {
390
+ displayName: 'Include Tables',
391
+ name: 'includeTables',
392
+ type: 'boolean',
393
+ default: true,
394
+ description: 'Whether to include extracted tables in the output (if table extraction is enabled)',
395
+ },
396
+ {
397
+ displayName: 'Markdown Output',
398
+ name: 'markdownOutput',
399
+ type: 'options',
400
+ options: [
401
+ {
402
+ name: 'Raw Markdown',
403
+ value: 'raw',
404
+ description: 'Return raw markdown (default, full content)',
405
+ },
406
+ {
407
+ name: 'Filtered Markdown',
408
+ value: 'fit',
409
+ description: 'Return content-filtered markdown (cleaner, main content only)',
410
+ },
411
+ {
412
+ name: 'Both',
413
+ value: 'both',
414
+ description: 'Return both raw and filtered markdown variants',
415
+ },
416
+ ],
417
+ default: 'raw',
418
+ description: 'Which markdown variant(s) to return in the output',
269
419
  },
270
420
  {
271
421
  displayName: 'Verbose Response',
@@ -276,30 +426,586 @@ exports.description = [
276
426
  },
277
427
  ],
278
428
  },
429
+ {
430
+ displayName: 'Content Filter',
431
+ name: 'contentFilter',
432
+ type: 'collection',
433
+ placeholder: 'Add Filter',
434
+ default: {},
435
+ displayOptions: {
436
+ show: {
437
+ operation: ['crawlSingleUrl'],
438
+ },
439
+ },
440
+ options: [
441
+ {
442
+ displayName: 'BM25 Threshold',
443
+ name: 'bm25Threshold',
444
+ type: 'number',
445
+ default: 1.0,
446
+ displayOptions: {
447
+ show: {
448
+ filterType: ['bm25'],
449
+ },
450
+ },
451
+ description: 'Minimum BM25 score threshold for content inclusion (default: 1.0)',
452
+ },
453
+ {
454
+ displayName: 'Chunk Token Threshold',
455
+ name: 'chunkTokenThreshold',
456
+ type: 'number',
457
+ displayOptions: {
458
+ show: {
459
+ filterType: ['llm'],
460
+ },
461
+ },
462
+ default: 8192,
463
+ description: 'Maximum tokens per chunk for LLM processing (default: 8192, recommended: 4096-16384)',
464
+ },
465
+ {
466
+ displayName: 'Filter Type',
467
+ name: 'filterType',
468
+ type: 'options',
469
+ options: [
470
+ {
471
+ name: 'None',
472
+ value: 'none',
473
+ description: 'No content filtering (return all content)',
474
+ },
475
+ {
476
+ name: 'Pruning Filter',
477
+ value: 'pruning',
478
+ description: 'Remove low-value content using relevance thresholds',
479
+ },
480
+ {
481
+ name: 'BM25 Filter',
482
+ value: 'bm25',
483
+ description: 'Filter content based on query relevance using BM25 algorithm',
484
+ },
485
+ {
486
+ name: 'LLM Filter',
487
+ value: 'llm',
488
+ description: 'Intelligent content filtering using LLM (requires LLM credentials)',
489
+ },
490
+ ],
491
+ default: 'none',
492
+ description: 'Type of content filtering to apply',
493
+ },
494
+ {
495
+ displayName: 'Ignore Cache',
496
+ name: 'ignoreCache',
497
+ type: 'boolean',
498
+ displayOptions: {
499
+ show: {
500
+ filterType: ['llm'],
501
+ },
502
+ },
503
+ default: false,
504
+ description: 'Whether to skip cache and always generate fresh filtered content',
505
+ },
506
+ {
507
+ displayName: 'Ignore Links',
508
+ name: 'ignoreLinks',
509
+ type: 'boolean',
510
+ default: false,
511
+ description: 'Whether to exclude links from markdown output',
512
+ },
513
+ {
514
+ displayName: 'LLM Instruction',
515
+ name: 'llmInstruction',
516
+ type: 'string',
517
+ typeOptions: {
518
+ rows: 8,
519
+ },
520
+ displayOptions: {
521
+ show: {
522
+ filterType: ['llm'],
523
+ },
524
+ },
525
+ default: `Extract the main content while preserving its original wording and substance completely.
526
+ Remove only clearly irrelevant elements like:
527
+ - Navigation menus
528
+ - Advertisement sections
529
+ - Cookie notices
530
+ - Footers with site information
531
+ - Sidebars with external links
532
+ - Any UI elements that don't contribute to the content
533
+
534
+ Keep all valuable educational or informational content intact.`,
535
+ description: 'Instructions for the LLM on how to filter and clean the content',
536
+ required: true,
537
+ },
538
+ {
539
+ displayName: 'LLM Verbose',
540
+ name: 'llmVerbose',
541
+ type: 'boolean',
542
+ displayOptions: {
543
+ show: {
544
+ filterType: ['llm'],
545
+ },
546
+ },
547
+ default: false,
548
+ description: 'Whether to enable verbose logging for LLM content filtering',
549
+ },
550
+ {
551
+ displayName: 'Min Word Threshold',
552
+ name: 'minWordThreshold',
553
+ type: 'number',
554
+ default: 0,
555
+ displayOptions: {
556
+ show: {
557
+ filterType: ['pruning'],
558
+ },
559
+ },
560
+ description: 'Minimum word count for content blocks to be considered (0 = no minimum)',
561
+ },
562
+ {
563
+ displayName: 'Threshold',
564
+ name: 'threshold',
565
+ type: 'number',
566
+ default: 0.48,
567
+ displayOptions: {
568
+ show: {
569
+ filterType: ['pruning'],
570
+ },
571
+ },
572
+ description: 'Relevance threshold for pruning (0.0-1.0, default: 0.48). Higher values = more aggressive filtering.',
573
+ },
574
+ {
575
+ displayName: 'Threshold Type',
576
+ name: 'thresholdType',
577
+ type: 'options',
578
+ options: [
579
+ {
580
+ name: 'Fixed',
581
+ value: 'fixed',
582
+ description: 'Use fixed threshold value',
583
+ },
584
+ {
585
+ name: 'Dynamic',
586
+ value: 'dynamic',
587
+ description: 'Calculate threshold dynamically based on content',
588
+ },
589
+ ],
590
+ default: 'fixed',
591
+ displayOptions: {
592
+ show: {
593
+ filterType: ['pruning'],
594
+ },
595
+ },
596
+ description: 'How to apply the pruning threshold',
597
+ },
598
+ {
599
+ displayName: 'User Query',
600
+ name: 'userQuery',
601
+ type: 'string',
602
+ default: '',
603
+ placeholder: 'main content topics keywords',
604
+ displayOptions: {
605
+ show: {
606
+ filterType: ['bm25'],
607
+ },
608
+ },
609
+ description: 'Query string to filter relevant content (BM25 will rank content by relevance to this query)',
610
+ },
611
+ ],
612
+ },
613
+ {
614
+ displayName: 'Advanced Options',
615
+ name: 'advancedOptions',
616
+ type: 'collection',
617
+ placeholder: 'Add Option',
618
+ default: {},
619
+ displayOptions: {
620
+ show: {
621
+ operation: ['crawlSingleUrl'],
622
+ },
623
+ },
624
+ options: [
625
+ {
626
+ displayName: 'Anti-Bot Features',
627
+ name: 'antiBotFeatures',
628
+ type: 'fixedCollection',
629
+ default: {},
630
+ options: [
631
+ {
632
+ name: 'features',
633
+ displayName: 'Features',
634
+ values: [
635
+ {
636
+ displayName: 'Magic Mode',
637
+ name: 'magic',
638
+ type: 'boolean',
639
+ default: false,
640
+ description: 'Whether to enable anti-detection techniques (stealth++)',
641
+ },
642
+ {
643
+ displayName: 'Simulate User Behavior',
644
+ name: 'simulateUser',
645
+ type: 'boolean',
646
+ default: false,
647
+ description: 'Whether to simulate human-like browsing behavior',
648
+ },
649
+ {
650
+ displayName: 'Override Navigator',
651
+ name: 'overrideNavigator',
652
+ type: 'boolean',
653
+ default: false,
654
+ description: 'Whether to override navigator properties to avoid detection',
655
+ },
656
+ ],
657
+ },
658
+ ],
659
+ },
660
+ {
661
+ displayName: 'Delay Before Return (Ms)',
662
+ name: 'delayBeforeReturnHtml',
663
+ type: 'number',
664
+ default: 0,
665
+ description: 'Milliseconds to wait before returning HTML (useful for dynamic content)',
666
+ },
667
+ {
668
+ displayName: 'Exclude External Images',
669
+ name: 'excludeExternalImages',
670
+ type: 'boolean',
671
+ default: false,
672
+ description: 'Whether to exclude images hosted on external domains',
673
+ },
674
+ {
675
+ displayName: 'Exclude Social Media Links',
676
+ name: 'excludeSocialMediaLinks',
677
+ type: 'boolean',
678
+ default: false,
679
+ description: 'Whether to exclude links to social media platforms',
680
+ },
681
+ {
682
+ displayName: 'Verbose Mode',
683
+ name: 'verbose',
684
+ type: 'boolean',
685
+ default: false,
686
+ description: 'Whether to enable verbose logging (debug mode)',
687
+ },
688
+ {
689
+ displayName: 'Wait Until',
690
+ name: 'waitUntil',
691
+ type: 'options',
692
+ options: [
693
+ {
694
+ name: 'Load',
695
+ value: 'load',
696
+ description: 'Wait for the load event',
697
+ },
698
+ {
699
+ name: 'DOM Content Loaded',
700
+ value: 'domcontentloaded',
701
+ description: 'Wait for DOMContentLoaded event',
702
+ },
703
+ {
704
+ name: 'Network Idle',
705
+ value: 'networkidle',
706
+ description: 'Wait for network to be idle (no requests for 500ms)',
707
+ },
708
+ {
709
+ name: 'Network Idle 2',
710
+ value: 'networkidle2',
711
+ description: 'Wait for network to be idle (no more than 2 requests for 500ms)',
712
+ },
713
+ ],
714
+ default: 'load',
715
+ description: 'When to consider page load complete',
716
+ },
717
+ ],
718
+ },
719
+ {
720
+ displayName: 'Table Extraction',
721
+ name: 'tableExtraction',
722
+ type: 'collection',
723
+ placeholder: 'Add Option',
724
+ default: {},
725
+ displayOptions: {
726
+ show: {
727
+ operation: ['crawlSingleUrl'],
728
+ },
729
+ },
730
+ options: [
731
+ {
732
+ displayName: 'Chunk Token Threshold',
733
+ name: 'chunkTokenThreshold',
734
+ type: 'number',
735
+ displayOptions: {
736
+ show: {
737
+ strategyType: ['llm'],
738
+ enableChunking: [true],
739
+ },
740
+ },
741
+ default: 10000,
742
+ description: 'Maximum tokens per chunk when processing large tables (default: 10000)',
743
+ },
744
+ {
745
+ displayName: 'CSS Selector',
746
+ name: 'cssSelector',
747
+ type: 'string',
748
+ displayOptions: {
749
+ show: {
750
+ strategyType: ['llm'],
751
+ },
752
+ },
753
+ default: '',
754
+ placeholder: '.main-content',
755
+ description: 'CSS selector to focus table extraction on specific page area (optional)',
756
+ },
757
+ {
758
+ displayName: 'Enable Chunking',
759
+ name: 'enableChunking',
760
+ type: 'boolean',
761
+ displayOptions: {
762
+ show: {
763
+ strategyType: ['llm'],
764
+ },
765
+ },
766
+ default: false,
767
+ description: 'Whether to enable chunking for large tables (recommended for tables with 100+ rows)',
768
+ },
769
+ {
770
+ displayName: 'Max Parallel Chunks',
771
+ name: 'maxParallelChunks',
772
+ type: 'number',
773
+ displayOptions: {
774
+ show: {
775
+ strategyType: ['llm'],
776
+ enableChunking: [true],
777
+ },
778
+ },
779
+ default: 5,
780
+ description: 'Maximum number of chunks to process in parallel (default: 5)',
781
+ },
782
+ {
783
+ displayName: 'Max Tries',
784
+ name: 'maxTries',
785
+ type: 'number',
786
+ displayOptions: {
787
+ show: {
788
+ strategyType: ['llm'],
789
+ },
790
+ },
791
+ default: 3,
792
+ description: 'Maximum number of retry attempts for LLM extraction (default: 3)',
793
+ },
794
+ {
795
+ displayName: 'Min Rows Per Chunk',
796
+ name: 'minRowsPerChunk',
797
+ type: 'number',
798
+ displayOptions: {
799
+ show: {
800
+ strategyType: ['llm'],
801
+ enableChunking: [true],
802
+ },
803
+ },
804
+ default: 20,
805
+ description: 'Minimum number of rows per chunk (default: 20)',
806
+ },
807
+ {
808
+ displayName: 'Strategy Type',
809
+ name: 'strategyType',
810
+ type: 'options',
811
+ options: [
812
+ {
813
+ name: 'None',
814
+ value: 'none',
815
+ description: 'No table extraction',
816
+ },
817
+ {
818
+ name: 'LLM Table Extraction',
819
+ value: 'llm',
820
+ description: 'Extract tables using LLM (handles complex tables with rowspan/colspan)',
821
+ },
822
+ {
823
+ name: 'Default Table Extraction',
824
+ value: 'default',
825
+ description: 'Extract tables using heuristics (faster, simpler tables only)',
826
+ },
827
+ ],
828
+ default: 'none',
829
+ description: 'Table extraction strategy to use',
830
+ },
831
+ {
832
+ displayName: 'Table Score Threshold',
833
+ name: 'tableScoreThreshold',
834
+ type: 'number',
835
+ displayOptions: {
836
+ show: {
837
+ strategyType: ['default'],
838
+ },
839
+ },
840
+ default: 5,
841
+ description: 'Minimum score for table to be included in results (default: 5, range: 0-10)',
842
+ },
843
+ {
844
+ displayName: 'Verbose',
845
+ name: 'verbose',
846
+ type: 'boolean',
847
+ displayOptions: {
848
+ show: {
849
+ strategyType: ['llm', 'default'],
850
+ },
851
+ },
852
+ default: false,
853
+ description: 'Whether to enable verbose logging for table extraction',
854
+ },
855
+ ],
856
+ },
279
857
  ];
280
858
  async function execute(items, nodeOptions) {
281
- var _a;
859
+ var _a, _b;
282
860
  const allResults = [];
283
861
  for (let i = 0; i < items.length; i++) {
284
862
  try {
285
863
  const url = this.getNodeParameter('url', i, '');
286
864
  const browserOptions = this.getNodeParameter('browserOptions', i, {});
865
+ const sessionOptions = this.getNodeParameter('sessionOptions', i, {});
287
866
  const crawlerOptions = this.getNodeParameter('crawlerOptions', i, {});
288
- const options = this.getNodeParameter('options', i, {});
867
+ const outputOptions = this.getNodeParameter('outputOptions', i, {});
868
+ const contentFilter = this.getNodeParameter('contentFilter', i, {});
869
+ const advancedOptions = this.getNodeParameter('advancedOptions', i, {});
870
+ let mergedBrowserOptions = { ...browserOptions, ...sessionOptions };
871
+ if (browserOptions.extraArgs && typeof browserOptions.extraArgs === 'object') {
872
+ const extraArgsCollection = browserOptions.extraArgs;
873
+ if (extraArgsCollection.args && Array.isArray(extraArgsCollection.args)) {
874
+ mergedBrowserOptions.extraArgs = extraArgsCollection.args.map((arg) => arg.value).filter((v) => v);
875
+ }
876
+ }
289
877
  if (!url) {
290
878
  throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URL cannot be empty.', { itemIndex: i });
291
879
  }
292
880
  if (!(0, utils_1.isValidUrl)(url)) {
293
881
  throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URL: ${url}`, { itemIndex: i });
294
882
  }
295
- const browserConfig = (0, utils_1.createBrowserConfig)(browserOptions);
883
+ const browserConfig = (0, utils_1.createBrowserConfig)(mergedBrowserOptions);
884
+ const antiBotFeatures = ((_a = advancedOptions.antiBotFeatures) === null || _a === void 0 ? void 0 : _a.features) || {};
296
885
  const crawlerConfig = (0, utils_1.createCrawlerRunConfig)({
297
886
  ...crawlerOptions,
298
887
  ...browserConfig,
888
+ screenshot: outputOptions.screenshot,
889
+ pdf: outputOptions.pdf,
890
+ fetchSslCertificate: outputOptions.fetchSslCertificate,
891
+ magic: antiBotFeatures.magic,
892
+ simulateUser: antiBotFeatures.simulateUser,
893
+ overrideNavigator: antiBotFeatures.overrideNavigator,
894
+ excludeSocialMediaLinks: advancedOptions.excludeSocialMediaLinks,
895
+ excludeExternalImages: advancedOptions.excludeExternalImages,
896
+ delayBeforeReturnHtml: advancedOptions.delayBeforeReturnHtml,
897
+ waitUntil: advancedOptions.waitUntil,
898
+ verbose: advancedOptions.verbose,
299
899
  });
900
+ if (contentFilter.filterType && contentFilter.filterType !== 'none') {
901
+ const enrichedFilterConfig = { ...contentFilter };
902
+ if (contentFilter.filterType === 'llm') {
903
+ const credentials = await this.getCredentials('crawl4aiPlusApi');
904
+ if (!credentials.enableLlm) {
905
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM content filtering.', { itemIndex: i });
906
+ }
907
+ let provider = 'openai/gpt-4o';
908
+ let apiKey = '';
909
+ if (credentials.llmProvider === 'openai') {
910
+ const model = credentials.llmModel || 'gpt-4o';
911
+ provider = `openai/${model}`;
912
+ apiKey = credentials.apiKey || '';
913
+ }
914
+ else if (credentials.llmProvider === 'anthropic') {
915
+ const model = credentials.llmModel || 'claude-3-haiku-20240307';
916
+ provider = `anthropic/${model}`;
917
+ apiKey = credentials.apiKey || '';
918
+ }
919
+ else if (credentials.llmProvider === 'groq') {
920
+ const model = credentials.llmModel || 'llama3-70b-8192';
921
+ provider = `groq/${model}`;
922
+ apiKey = credentials.apiKey || '';
923
+ }
924
+ else if (credentials.llmProvider === 'ollama') {
925
+ const model = credentials.ollamaModel || 'llama3';
926
+ provider = `ollama/${model}`;
927
+ }
928
+ else if (credentials.llmProvider === 'other') {
929
+ provider = credentials.customProvider || 'custom/model';
930
+ apiKey = credentials.customApiKey || '';
931
+ }
932
+ if (!apiKey && credentials.llmProvider !== 'ollama') {
933
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
934
+ }
935
+ enrichedFilterConfig.llmConfig = {
936
+ type: 'LLMConfig',
937
+ params: {
938
+ provider,
939
+ api_token: apiKey,
940
+ ...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
941
+ { api_base: credentials.customBaseUrl } : {}),
942
+ ...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
943
+ { api_base: credentials.ollamaUrl } : {})
944
+ }
945
+ };
946
+ }
947
+ crawlerConfig.markdownGenerator = (0, utils_1.createMarkdownGenerator)(enrichedFilterConfig);
948
+ }
949
+ const tableExtractionConfig = this.getNodeParameter('tableExtraction', i, {});
950
+ if (tableExtractionConfig.strategyType && tableExtractionConfig.strategyType !== 'none') {
951
+ const enrichedTableConfig = { ...tableExtractionConfig };
952
+ if (tableExtractionConfig.strategyType === 'llm') {
953
+ const credentials = await this.getCredentials('crawl4aiPlusApi');
954
+ if (!credentials.enableLlm) {
955
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM table extraction.', { itemIndex: i });
956
+ }
957
+ let provider = 'openai/gpt-4o';
958
+ let apiKey = '';
959
+ if (credentials.llmProvider === 'openai') {
960
+ const model = credentials.llmModel || 'gpt-4o';
961
+ provider = `openai/${model}`;
962
+ apiKey = credentials.apiKey || '';
963
+ }
964
+ else if (credentials.llmProvider === 'anthropic') {
965
+ const model = credentials.llmModel || 'claude-3-haiku-20240307';
966
+ provider = `anthropic/${model}`;
967
+ apiKey = credentials.apiKey || '';
968
+ }
969
+ else if (credentials.llmProvider === 'groq') {
970
+ const model = credentials.llmModel || 'llama3-70b-8192';
971
+ provider = `groq/${model}`;
972
+ apiKey = credentials.apiKey || '';
973
+ }
974
+ else if (credentials.llmProvider === 'ollama') {
975
+ const model = credentials.ollamaModel || 'llama3';
976
+ provider = `ollama/${model}`;
977
+ }
978
+ else if (credentials.llmProvider === 'other') {
979
+ provider = credentials.customProvider || 'custom/model';
980
+ apiKey = credentials.customApiKey || '';
981
+ }
982
+ if (!apiKey && credentials.llmProvider !== 'ollama') {
983
+ throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
984
+ }
985
+ enrichedTableConfig.llmConfig = {
986
+ type: 'LLMConfig',
987
+ params: {
988
+ provider,
989
+ api_token: apiKey,
990
+ ...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
991
+ { api_base: credentials.customBaseUrl } : {}),
992
+ ...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
993
+ { api_base: credentials.ollamaUrl } : {})
994
+ }
995
+ };
996
+ }
997
+ crawlerConfig.tableExtraction = (0, utils_1.createTableExtractionStrategy)(enrichedTableConfig);
998
+ }
300
999
  const crawler = await (0, utils_1.getCrawl4aiClient)(this);
301
1000
  const result = await crawler.crawlUrl(url, crawlerConfig);
302
- const formattedResult = (0, formatters_1.formatCrawlResult)(result, options.includeMedia, options.verboseResponse);
1001
+ const formattedResult = (0, formatters_1.formatCrawlResult)(result, outputOptions.includeMedia, outputOptions.verboseResponse, {
1002
+ markdownOutput: outputOptions.markdownOutput,
1003
+ includeLinks: outputOptions.includeLinks,
1004
+ includeScreenshot: outputOptions.screenshot,
1005
+ includePdf: outputOptions.pdf,
1006
+ includeSslCertificate: outputOptions.fetchSslCertificate,
1007
+ includeTables: outputOptions.includeTables,
1008
+ });
303
1009
  allResults.push({
304
1010
  json: formattedResult,
305
1011
  pairedItem: { item: i },
@@ -308,7 +1014,7 @@ async function execute(items, nodeOptions) {
308
1014
  catch (error) {
309
1015
  if (this.continueOnFail()) {
310
1016
  const node = this.getNode();
311
- const errorItemIndex = (_a = error.itemIndex) !== null && _a !== void 0 ? _a : i;
1017
+ const errorItemIndex = (_b = error.itemIndex) !== null && _b !== void 0 ? _b : i;
312
1018
  allResults.push({
313
1019
  json: items[i].json,
314
1020
  error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }),