n8n-nodes-crawl4ai-plus 2.0.8 → 3.0.0
- package/LICENSE +23 -23
- package/README.md +129 -41
- package/dist/credentials/Crawl4aiApi.credentials.js +2 -34
- package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js +1230 -30
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js +715 -9
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js +495 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js +9 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.d.ts +4 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js +94 -60
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.d.ts +8 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js +49 -12
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/interfaces.d.ts +38 -5
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.d.ts +13 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js +270 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/Crawl4aiPlusContentExtractor.node.js +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/Crawl4aiPlusContentExtractor.node.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js +445 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js +108 -8
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js +49 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js +134 -17
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js +27 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js +206 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js +376 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.d.ts +4 -2
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js +53 -16
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/index.js +11 -11
- package/package.json +1 -1
- /package/dist/nodes/Crawl4aiPlusBasicCrawler/{crawl4ai.svg → crawl4aiplus.svg} +0 -0
- /package/dist/nodes/Crawl4aiPlusContentExtractor/{crawl4ai.svg → crawl4aiplus.svg} +0 -0
package/LICENSE
CHANGED
@@ -1,23 +1,23 @@
-MIT License
-
-Copyright (c) 2025 Heictor Hsiao (Original Author)
-Copyright (c) 2025 Matias Lopez (First Maintainer)
-Copyright (c) 2025 Max Soukhomlinov (Current Maintainer)
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+MIT License
+
+Copyright (c) 2025 Heictor Hsiao (Original Author)
+Copyright (c) 2025 Matias Lopez (First Maintainer)
+Copyright (c) 2025 Max Soukhomlinov (Current Maintainer)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
package/README.md
CHANGED
@@ -13,39 +13,66 @@ This is a maintained fork with enhanced features for Crawl4AI 0.7.x.
 
 All credit for the original implementation goes to **Heictor Hsiao** and **Matias Lopez**.
 
-## What's New in
-
-This
-
-
+## What's New in v2.1.0
+
+This version includes comprehensive Crawl4AI 0.7.4+ support with major improvements:
+
+### 🚀 Major Features
+- ✅ **Recursive Deep Crawling** - Keyword-driven recursive crawling with BestFirst/BFS/DFS strategies
+- ✅ **6 Extraction Strategies** - CSS, LLM, JSON, Regex, Cosine Similarity, and SEO Metadata extraction
+- ✅ **LLM Pattern Generation** - Natural language to regex pattern conversion
+- ✅ **Table Extraction** - LLM-based and default table extraction for complex structures
+- ✅ **Session Management** - Browser storage state, cookies, and persistent contexts
+- ✅ **Output Formats** - Screenshot, PDF, and SSL certificate extraction
+- ✅ **Content Filtering** - Pruning, BM25, and LLM content filters
+- ✅ **Anti-Bot Features** - Magic mode, user simulation, and navigator override
+
+### 🔧 Core Improvements
+- ✅ **Unified API Client** - Standardized error handling with actionable messages
+- ✅ **95%+ API Coverage** - Comprehensive support for Crawl4AI 0.7.4 REST API
 - ✅ **Multi-Browser Support** - Chromium, Firefox, and Webkit
+- ✅ **22+ LLM Providers** - OpenAI, Anthropic, Groq, Ollama, and custom providers
 - ✅ **Enhanced Cache Modes** - 5 modes (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS)
-- ✅ **22+ LLM Providers** - OpenAI, Anthropic, Google, DeepSeek, Groq, Ollama, and more
-- ✅ **Dynamic Content** - wait_for parameter for JavaScript-heavy sites
-- ✅ **External LiteLLM Proxy** - Connect to custom LLM endpoints
-- ✅ **Better Error Handling** - Exposed status codes and detailed error messages
 
 ---
 
 This project provides n8n integration for Crawl4AI, a powerful web crawling and data extraction tool. It consists of two main nodes:
 
-1. **Crawl4AI: Basic Crawler** - For general web crawling and content extraction
-2. **Crawl4AI: Content Extractor** - For extracting structured data using
+1. **Crawl4AI Plus: Basic Crawler** - For general web crawling, recursive discovery, and content extraction
+2. **Crawl4AI Plus: Content Extractor** - For extracting structured data using 6 different extraction strategies
 
 ## Features
 
 ### Basic Crawler Node
 
-- **Crawl Single URL** - Extract content from a single web page
-- **Crawl Multiple URLs** - Process multiple web pages
+- **Crawl Single URL** - Extract content from a single web page with full configuration options
+- **Crawl Multiple URLs** - Process multiple web pages or use recursive discovery mode
+  - **Recursive Discovery** - Keyword-driven deep crawling with configurable depth and filters
+  - **Multiple Strategies** - BestFirst (recommended), BFS, or DFS crawling strategies
+  - **Extraction Options** - Apply CSS or LLM extraction to each discovered page (shallow crawl with extraction)
 - **Process Raw HTML** - Extract content from raw HTML without crawling
+- **Discover Links** - Extract and filter all links from a page
+  - **Link Types** - Filter by internal or external links
+  - **Pattern Filters** - Include/exclude URLs by pattern matching
+  - **Output Formats** - Grouped or split output for workflow flexibility
 
 ### Content Extractor Node
 
 - **CSS Selector Extractor** - Extract structured data using CSS selectors
-- **LLM Extractor** - Use AI to extract structured data
-- **
-- **
+- **LLM Extractor** - Use AI to extract structured data with schema support
+  - **Input Formats** - Markdown, HTML, or fit_markdown
+  - **Schema Modes** - Simple fields or advanced JSON schema
+- **JSON Extractor** - Extract and process JSON data from web pages (direct, script tags, or JSON-LD)
+- **Regex Extractor** - Extract data using 21 built-in patterns, custom regex, or LLM-generated patterns
+  - **Quick Presets** - Contact Info and Financial Data presets for common extraction tasks
+- **Cosine Similarity Extractor** - Semantic similarity-based content extraction with clustering (requires `all` Docker image)
+- **SEO Metadata Extractor** - Extract SEO metadata including:
+  - **Basic Meta Tags** - Title, description, keywords, canonical URL
+  - **Open Graph Tags** - OG title, description, image, type
+  - **Twitter Cards** - Twitter card metadata
+  - **JSON-LD** - Schema.org structured data
+
+> **Note**: Table extraction is available in the **Basic Crawler** node via the Table Extraction options (LLM-based or default heuristics).
 
 ## Installation
 
@@ -62,9 +89,10 @@ Before using the nodes, you need to set up Crawl4AI API credentials:
 1. Go to **Settings > Credentials > New**
 2. Select **Crawl4AI API**
 3. Configure connection settings:
-   - **
-   - **Authentication**:
-   - **LLM Settings**: Enable and configure
+   - **Docker URL**: URL of your Crawl4AI Docker container (default: `http://localhost:11235`)
+   - **Authentication**: Optional token or basic auth if your Docker instance requires it
+   - **LLM Settings**: Enable and configure LLM provider for AI extraction features
+     - Supported providers: OpenAI, Anthropic, Groq, Ollama, or custom LiteLLM endpoints
 
 ### Basic Crawler Usage
 
@@ -79,75 +107,135 @@ The Basic Crawler node allows you to crawl web pages and extract their content:
 
 The Content Extractor node allows you to extract structured data from web pages:
 
-1. Add the "Crawl4AI: Content Extractor" node to your workflow
-2. Select an extraction method
+1. Add the "Crawl4AI Plus: Content Extractor" node to your workflow
+2. Select an extraction method:
+   - **CSS Selector** - For structured pages with consistent selectors
+   - **LLM Extractor** - For AI-powered extraction with natural language instructions
+   - **JSON Extractor** - For JSON APIs or embedded JSON data
+   - **Regex Extractor** - For pattern-based extraction (21 built-in patterns or custom)
+   - **Cosine Extractor** - For semantic similarity-based clustering (requires transformers)
+   - **SEO Metadata** - For extracting page titles, meta tags, OG tags, and JSON-LD structured data
 3. Configure the extraction parameters
 4. Run the workflow to extract structured data
 
+> **Tip**: For table extraction, use the **Basic Crawler** node with Table Extraction options enabled.
+
 ## Configuration Options
 
 ### Browser Options
 
+- **Browser Type**: Chromium (default), Firefox, or Webkit
 - **Headless Mode**: Run browser in headless mode
 - **Enable JavaScript**: Enable JavaScript execution
+- **Enable Stealth Mode**: Bypass basic bot detection
+- **Extra Browser Arguments**: Custom command-line arguments
 - **Viewport Size**: Set browser viewport dimensions
 - **Timeout**: Maximum time to wait for page load
 - **User Agent**: Override browser user agent
 
+### Session & Authentication
+
+- **Storage State (JSON)**: Browser storage state for authenticated sessions (works in all n8n environments)
+- **Cookies**: Array of cookie objects for simple authentication
+- **User Data Directory**: Persistent browser profiles (self-hosted only)
+- **Use Managed Browser**: Enable managed browser mode for persistent contexts
+
 ### Crawler Options
 
-- **Cache Mode**:
-- **JavaScript Code**: Execute custom JS on the page
+- **Cache Mode**: 5 modes (ENABLED, DISABLED, READ_ONLY, WRITE_ONLY, BYPASS)
+- **JavaScript Code**: Execute custom JS on the page before extraction
 - **CSS Selector**: Focus crawling on specific elements
+- **Wait Until**: Control when page is considered loaded
+- **Delay Before Return**: Add delay before returning HTML
 - **Excluded Tags**: Skip specific HTML tags
 - **Check Robots.txt**: Respect robots.txt rules
 - **Word Count Threshold**: Filter content by word count
 
+### Deep Crawl Options (Crawl Multiple URLs)
+
+- **Crawl Mode**: Manual URL list or Recursive Discovery
+- **Seed URL**: Starting URL for recursive discovery
+- **Query**: Keywords for relevance-based crawling
+- **Strategy**: BestFirst (recommended), BFS, or DFS
+- **Max Depth**: Maximum crawl depth
+- **Max Pages**: Maximum number of pages to crawl
+- **Domain Filters**: Include/exclude specific domains
+- **URL Pattern Filters**: Regex patterns for URL filtering
+
+### Output Options
+
+- **Screenshot**: Capture page screenshots
+- **PDF**: Generate PDF from page
+- **SSL Certificate**: Extract SSL certificate information
+- **Markdown Variants**: Raw markdown, fit markdown, or cleaned markdown
+- **Structured Links**: Extract and structure all links from page
+
+### Content Filtering
+
+- **Pruning Filter**: Remove low-value content based on thresholds
+- **BM25 Filter**: Relevance-based content filtering
+- **LLM Content Filter**: Intelligent content filtering using LLM
+
 ### LLM Extraction Options
 
-- **Extraction Instructions**:
-- **Schema
+- **Extraction Instructions**: Natural language instructions for the AI
+- **Schema Mode**: Simple fields or advanced JSON schema
+- **Input Format**: Markdown (default), HTML, or fit_markdown
 - **LLM Provider**: Choose AI model provider
-- **Temperature**: Control randomness of AI responses
+- **Temperature**: Control randomness of AI responses (0-1)
+- **Max Tokens**: Maximum tokens for LLM response
 
 ## Project Structure
 
 ```
 nodes/
-├──
-│   ├──
-│   ├──
+├── Crawl4aiPlusBasicCrawler/
+│   ├── Crawl4aiPlusBasicCrawler.node.ts        # Main node file
+│   ├── crawl4aiplus.svg                        # Icon
 │   ├── actions/
 │   │   ├── operations.ts                       # Operations definition
 │   │   ├── router.ts                           # Router handler
 │   │   ├── crawlSingleUrl.operation.ts         # Single URL crawl operation
-│   │   ├── crawlMultipleUrls.operation.ts      # Multiple URL crawl
-│   │
+│   │   ├── crawlMultipleUrls.operation.ts      # Multiple URL crawl + recursive discovery
+│   │   ├── processRawHtml.operation.ts         # Raw HTML processing operation
+│   │   └── discoverLinks.operation.ts          # Link discovery and extraction
 │   └── helpers/
 │       ├── interfaces.ts                       # Interface definitions
 │       ├── utils.ts                            # Common utilities
-│       ├── apiClient.ts                        # API client
+│       ├── apiClient.ts                        # Unified API client
 │       └── formatters.ts                       # Formatting tools
 │
-└──
-    ├──
-    ├──
+└── Crawl4aiPlusContentExtractor/
+    ├── Crawl4aiPlusContentExtractor.node.ts    # Main node file
+    ├── crawl4aiplus.svg                        # Icon
     ├── actions/
     │   ├── operations.ts                       # Operations definition
    │   ├── router.ts                           # Router handler
-    │   ├── cssExtractor.operation.ts           # CSS selector extraction
-    │   ├── llmExtractor.operation.ts           # LLM extraction
-    │
+    │   ├── cssExtractor.operation.ts           # CSS selector extraction
+    │   ├── llmExtractor.operation.ts           # LLM extraction
+    │   ├── jsonExtractor.operation.ts          # JSON extraction
+    │   ├── regexExtractor.operation.ts         # Regex extraction
+    │   ├── cosineExtractor.operation.ts        # Cosine similarity extraction
+    │   └── seoExtractor.operation.ts           # SEO metadata extraction
     └── helpers/
         ├── interfaces.ts                       # Interface definitions
-
-        ├── apiClient.ts                        # API client
-        └── formatters.ts                       # Formatting tools
+        └── utils.ts                            # Common utilities (re-exports from BasicCrawler)
 
 credentials/
 └── Crawl4aiApi.credentials.ts # Credentials definition
 ```
 
+## Requirements
+
+- **n8n**: Version 1.79.1 or higher
+- **Crawl4AI Docker**: Version 0.7.3+ (0.7.4 recommended)
+  - For Cosine Similarity Extractor: Use `unclecode/crawl4ai:all` image (includes transformers)
+  - Standard image: `unclecode/crawl4ai:latest` or `unclecode/crawl4ai:0.7.3`
+
+## Version History
+
+See [CHANGELOG.md](CHANGELOG.md) for detailed version history and breaking changes.
+
 ## License
 
 MIT
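Both nodes are thin wrappers over the Crawl4AI Docker REST API that the README's credential points at via the Docker URL. As a rough sketch of the kind of request the Basic Crawler ultimately issues for a single-URL crawl — written against the Crawl4AI 0.7.x `POST /crawl` endpoint and its `{ type, params }` config envelope, neither of which appears verbatim in this diff:

```typescript
// Sketch only: the endpoint path, payload envelope, and param names below are
// assumptions based on the Crawl4AI 0.7.x Docker REST API, not taken from this diff.
interface Crawl4aiCredentials {
	dockerUrl: string; // e.g. 'http://localhost:11235'
	apiToken?: string; // optional bearer token (Authentication Type: 'token')
}

async function crawlSingleUrl(creds: Crawl4aiCredentials, url: string): Promise<unknown> {
	const response = await fetch(`${creds.dockerUrl}/crawl`, {
		method: 'POST',
		headers: {
			'Content-Type': 'application/json',
			// Only send Authorization when the Docker instance requires a token.
			...(creds.apiToken ? { Authorization: `Bearer ${creds.apiToken}` } : {}),
		},
		body: JSON.stringify({
			urls: [url],
			browser_config: { type: 'BrowserConfig', params: { headless: true } },
			// Exact cache_mode serialization (casing) may differ per server version.
			crawler_config: { type: 'CrawlerRunConfig', params: { cache_mode: 'bypass' } },
		}),
	});
	if (!response.ok) {
		throw new Error(`Crawl4AI request failed: ${response.status} ${response.statusText}`);
	}
	return response.json();
}
```

The README's browser, cache, deep-crawl, and output options would presumably all travel inside `browser_config.params` and `crawler_config.params` in a shape like this.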
package/dist/credentials/Crawl4aiApi.credentials.js
CHANGED
@@ -7,37 +7,13 @@ class Crawl4aiApi {
         this.displayName = 'Crawl4AI Plus API';
         this.documentationUrl = 'https://github.com/msoukhomlinov/n8n-nodes-crawl4ai';
         this.properties = [
-            {
-                displayName: 'Connection Mode',
-                name: 'connectionMode',
-                type: 'options',
-                options: [
-                    {
-                        name: 'Direct Python Package',
-                        value: 'direct',
-                        description: 'Use Crawl4AI directly as a Python package'
-                    },
-                    {
-                        name: 'Docker Client',
-                        value: 'docker',
-                        description: 'Connect to a Crawl4AI Docker container'
-                    },
-                ],
-                default: 'docker',
-                description: 'The mode to connect to Crawl4AI'
-            },
             {
                 displayName: 'Docker Server URL',
                 name: 'dockerUrl',
                 type: 'string',
                 default: 'http://crawl4ai:11235',
                 placeholder: 'http://crawl4ai:11235',
-                description: 'The URL of the Crawl4AI Docker server',
-                displayOptions: {
-                    show: {
-                        connectionMode: ['docker'],
-                    },
-                },
+                description: 'The URL of the Crawl4AI Docker REST API server. This node connects to Crawl4AI via Docker REST API.',
             },
             {
                 displayName: 'Authentication Type',
@@ -61,12 +37,7 @@ class Crawl4aiApi {
                     },
                 ],
                 default: 'token',
-                description: 'The authentication method to use',
-                displayOptions: {
-                    show: {
-                        connectionMode: ['docker'],
-                    },
-                },
+                description: 'The authentication method to use for the Docker REST API',
             },
             {
                 displayName: 'API Token',
@@ -79,7 +50,6 @@ class Crawl4aiApi {
                 description: 'The API token for Docker server authentication',
                 displayOptions: {
                     show: {
-                        connectionMode: ['docker'],
                         authenticationType: ['token'],
                     },
                 },
@@ -92,7 +62,6 @@ class Crawl4aiApi {
                 description: 'The username for Docker server authentication',
                 displayOptions: {
                     show: {
-                        connectionMode: ['docker'],
                         authenticationType: ['basic'],
                    },
                 },
@@ -108,7 +77,6 @@ class Crawl4aiApi {
                 description: 'The password for Docker server authentication',
                 displayOptions: {
                     show: {
-                        connectionMode: ['docker'],
                         authenticationType: ['basic'],
                     },
                 },
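The deleted `Connection Mode` property is the breaking change that the 2.0.8 → 3.0.0 major bump implies: the credential now always targets the Docker REST API, and the 'Direct Python Package' mode is gone. For reference, a sketch of the remaining credential data shape, with field names and defaults taken from the diff above (the LLM-related fields visible only in the source map below are omitted, and the full Authentication Type option list is elided from the diff):

```typescript
// Sketch of the credential fields left after 'connectionMode' was removed;
// derived from the property definitions shown in the diff, not from the package source.
type AuthenticationType = 'token' | 'basic';

interface Crawl4aiApiCredentialData {
	dockerUrl: string;                      // default: 'http://crawl4ai:11235'
	authenticationType: AuthenticationType; // default: 'token'
	apiToken?: string;                      // shown when authenticationType === 'token'
	username?: string;                      // shown when authenticationType === 'basic'
	password?: string;                      // shown when authenticationType === 'basic'
}
```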
package/dist/credentials/Crawl4aiApi.credentials.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"Crawl4aiApi.credentials.js","sourceRoot":"","sources":["../../credentials/Crawl4aiApi.credentials.ts"],"names":[],"mappings":";;;AAEA,MAAa,WAAW;IAAxB;QACC,SAAI,GAAG,iBAAiB,CAAC;QACzB,gBAAW,GAAG,mBAAmB,CAAC;QAClC,qBAAgB,GAAG,qDAAqD,CAAC;QACzE,eAAU,GAAsB;YAE/B;gBACC,WAAW,EAAE,
+{"version":3,"file":"Crawl4aiApi.credentials.js","sourceRoot":"","sources":["../../credentials/Crawl4aiApi.credentials.ts"],"names":[],"mappings":";;;AAEA,MAAa,WAAW;IAAxB;QACC,SAAI,GAAG,iBAAiB,CAAC;QACzB,gBAAW,GAAG,mBAAmB,CAAC;QAClC,qBAAgB,GAAG,qDAAqD,CAAC;QACzE,eAAU,GAAsB;YAE/B;gBACC,WAAW,EAAE,mBAAmB;gBAChC,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,uBAAuB;gBAChC,WAAW,EAAE,uBAAuB;gBACpC,WAAW,EAAE,qGAAqG;aAClH;YACD;gBACC,WAAW,EAAE,qBAAqB;gBAClC,IAAI,EAAE,oBAAoB;gBAC1B,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE;oBACR;wBACC,IAAI,EAAE,mBAAmB;wBACzB,KAAK,EAAE,MAAM;wBACb,WAAW,EAAE,+BAA+B;qBAC5C;oBACD;wBACC,IAAI,EAAE,sBAAsB;wBAC5B,KAAK,EAAE,OAAO;wBACd,WAAW,EAAE,qCAAqC;qBAClD;oBACD;wBACC,IAAI,EAAE,kCAAkC;wBACxC,KAAK,EAAE,OAAO;wBACd,WAAW,EAAE,8CAA8C;qBAC3D;iBACD;gBACD,OAAO,EAAE,OAAO;gBAChB,WAAW,EAAE,0DAA0D;aACvE;YACD;gBACC,WAAW,EAAE,WAAW;gBACxB,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,QAAQ;gBACd,WAAW,EAAE;oBACZ,QAAQ,EAAE,IAAI;iBACd;gBACD,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,gDAAgD;gBAC7D,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,kBAAkB,EAAE,CAAC,OAAO,CAAC;qBAC7B;iBACD;aACD;YACD;gBACC,WAAW,EAAE,UAAU;gBACvB,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,+CAA+C;gBAC5D,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,kBAAkB,EAAE,CAAC,OAAO,CAAC;qBAC7B;iBACD;aACD;YACD;gBACC,WAAW,EAAE,UAAU;gBACvB,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,QAAQ;gBACd,WAAW,EAAE;oBACZ,QAAQ,EAAE,IAAI;iBACd;gBACD,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,+CAA+C;gBAC5D,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,kBAAkB,EAAE,CAAC,OAAO,CAAC;qBAC7B;iBACD;aACD;YAED;gBACC,WAAW,EAAE,qBAAqB;gBAClC,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE,KAAK;gBACd,WAAW,EAAE,sCAAsC;aACnD;YACD;gBACC,WAAW,EAAE,cAAc;gBAC3B,IAAI,EAAE,aAAa;gBACnB,IAAI,EAAE,SAAS;gBACf,OAAO,EAAE;oBACR;wBACC,IAAI,EAAE,QAAQ;wBACd,KAAK,EAAE,QAAQ;qBACf;oBACD;wBACC,IAAI,EAAE,QAAQ;wBACd,KAAK,EAAE,QAAQ;qBACf;oBACD;wBACC,IAAI,EAAE,MAAM;wBACZ,KAAK,EAAE,MAAM;qBACb;oBACD;wBACC,IAAI,EAAE,WAAW;wBACjB,KAAK,EAAE,WAAW;qBAClB;oBACD;wBACC,IAAI,EAAE,kBAAkB;wBACxB,KAAK,EAAE,OAAO;qBACd;iBACD;gBACD,OAAO,EAAE,QAAQ;gBACjB,WAAW,EAAE,gDAAgD;gBAC7D,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;qBACjB;iBACD;aACD;YACD;gBACC,WAAW,EAAE,cAAc;gBAC3B,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,QAAQ;gBACjB,WAAW,EAAE,aAAa;gBAC1B,WAAW,EAAE,4FAA4F;gBACzG,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,WAAW,CAAC;qBAC5C;iBACD;aACD;YACD;gBACC,WAAW,EAAE,iBAAiB;gBAC9B,IAAI,EAAE,aAAa;gBACnB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,QAAQ;gBACjB,WAAW,EAAE,UAAU;gBACvB,WAAW,EAAE,4CAA4C;gBACzD,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,QAAQ,CAAC;qBACvB;iBACD;aACD;YACD;gBACC,WAAW,EAAE,SAAS;gBACtB,IAAI,EAAE,QAAQ;gBACd,IAAI,EAAE,QAAQ;gBACd,WAAW,EAAE;oBACZ,QAAQ,EAAE,IAAI;iBACd;gBACD,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,kCAAkC;gBAC/C,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,QAAQ,EAAE,MAAM,EAAE,WAAW,CAAC;qBAC5C;iBACD;aACD;YACD;gBACC,WAAW,EAAE,YAAY;gBACzB,IAAI,EAAE,WAAW;gBACjB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,wBAAwB;gBACjC,WAAW,EAAE,2BAA2B;gBACxC,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,QAAQ,CAAC;qBACvB;iBACD;aACD;YACD;gBACC,WAAW,EAAE,iBAAiB;gBAC9B,IAAI,EAAE,gBAAgB;gBACtB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,sCAAsC;gBACnD,WAAW,EAAE,kJAAkJ;gBAC/J,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;qBACtB;iBACD;aACD;YACD;gBACC,WAAW,EAAE,iBAAiB;gBAC9B,IAAI,EAAE,eAAe;gBACrB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,sCAAsC;gBACnD,WAAW,EAAE,0IAA0I;gBACvJ,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;qBACtB;iBACD;aACD;YACD;gBACC,WAAW,EAAE,yBAAyB;gBACtC,IAAI,EAAE,cAAc;gBACpB,IAAI,EAAE,QAAQ;gBACd,WAAW,EAAE;oBACZ,QAAQ,EAAE,IAAI;iBACd;gBACD,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,6DAA6D;gBAC1E,cAAc,EAAE;oBACf,IAAI,EAAE;wBACL,SAAS,EAAE,CAAC,IAAI,CAAC;wBACjB,WAAW,EAAE,CAAC,OAAO,CAAC;qBACtB;iBACD;aACD;YAED;gBACC,WAAW,EAAE,iBAAiB;gBAC9B,IAAI,EAAE,UAAU;gBAChB,IAAI,EAAE,QAAQ;gBACd,OAAO,EAAE,EAAE;gBACX,WAAW,EAAE,gBAAgB;gBAC7B,WAAW,EAAE,8DAA8D;aAC3E;SACD,CAAC;IACH,CAAC;CAAA;AAzOD,kCAyOC"}
package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js
CHANGED
@@ -8,7 +8,7 @@ class Crawl4aiPlusBasicCrawler {
         this.description = {
             displayName: 'Crawl4AI Plus: Basic Crawler',
             name: 'crawl4aiPlusBasicCrawler',
-            icon: 'file:
+            icon: 'file:crawl4aiplus.svg',
             group: ['transform'],
             version: 1,
             subtitle: '={{$parameter["operation"]}}',
package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"Crawl4aiPlusBasicCrawler.node.js","sourceRoot":"","sources":["../../../nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.ts"],"names":[],"mappings":";;;AAQA,6CAA0C;AAC1C,qDAA4E;AAG5E,MAAa,wBAAwB;IAArC;QACC,gBAAW,GAAyB;YACnC,WAAW,EAAE,8BAA8B;YAC3C,IAAI,EAAE,0BAA0B;YAChC,IAAI,EAAE,
+{"version":3,"file":"Crawl4aiPlusBasicCrawler.node.js","sourceRoot":"","sources":["../../../nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.ts"],"names":[],"mappings":";;;AAQA,6CAA0C;AAC1C,qDAA4E;AAG5E,MAAa,wBAAwB;IAArC;QACC,gBAAW,GAAyB;YACnC,WAAW,EAAE,8BAA8B;YAC3C,IAAI,EAAE,0BAA0B;YAChC,IAAI,EAAE,uBAAuB;YAC7B,KAAK,EAAE,CAAC,WAAW,CAAC;YACpB,OAAO,EAAE,CAAC;YACV,QAAQ,EAAE,8BAA8B;YACxC,WAAW,EAAE,kDAAkD;YAC/D,QAAQ,EAAE;gBACT,IAAI,EAAE,8BAA8B;aACpC;YACD,MAAM,EAAE,CAAC,MAAM,CAAC;YAChB,OAAO,EAAE,CAAC,MAAM,CAAC;YACjB,WAAW,EAAE;gBACZ;oBACC,IAAI,EAAE,iBAAiB;oBACvB,QAAQ,EAAE,IAAI;iBACd;aACD;YACD,UAAU,EAAE;gBACX,GAAG,wBAAqB;aACxB;SACD,CAAC;IAKH,CAAC;IAHA,KAAK,CAAC,OAAO;QACZ,OAAO,MAAM,eAAM,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAChC,CAAC;CACD;AA5BD,4DA4BC"}