n8n-nodes-crawl4ai-plus 2.0.8 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +23 -23
- package/README.md +129 -41
- package/dist/credentials/Crawl4aiApi.credentials.js +2 -34
- package/dist/credentials/Crawl4aiApi.credentials.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/Crawl4aiPlusBasicCrawler.node.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js +1230 -30
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlMultipleUrls.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js +715 -9
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/crawlSingleUrl.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js +495 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/discoverLinks.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js +9 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/actions/processRawHtml.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.d.ts +4 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js +94 -60
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/apiClient.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.d.ts +8 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js +49 -12
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/formatters.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/interfaces.d.ts +38 -5
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.d.ts +13 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js +270 -0
- package/dist/nodes/Crawl4aiPlusBasicCrawler/helpers/utils.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/Crawl4aiPlusContentExtractor.node.js +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/Crawl4aiPlusContentExtractor.node.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js +445 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cosineExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js +108 -8
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/cssExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js +49 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/jsonExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js +134 -17
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/llmExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js +27 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/operations.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js +206 -9
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/regexExtractor.operation.js.map +1 -1
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.d.ts +4 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js +376 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/actions/seoExtractor.operation.js.map +1 -0
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.d.ts +4 -2
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js +53 -16
- package/dist/nodes/Crawl4aiPlusContentExtractor/helpers/utils.js.map +1 -1
- package/dist/tsconfig.tsbuildinfo +1 -1
- package/index.js +11 -11
- package/package.json +1 -1
- /package/dist/nodes/Crawl4aiPlusBasicCrawler/{crawl4ai.svg → crawl4aiplus.svg} +0 -0
- /package/dist/nodes/Crawl4aiPlusContentExtractor/{crawl4ai.svg → crawl4aiplus.svg} +0 -0
|
@@ -6,18 +6,282 @@ const n8n_workflow_1 = require("n8n-workflow");
|
|
|
6
6
|
const utils_1 = require("../helpers/utils");
|
|
7
7
|
const formatters_1 = require("../helpers/formatters");
|
|
8
8
|
exports.description = [
|
|
9
|
+
{
|
|
10
|
+
displayName: 'Crawl Mode',
|
|
11
|
+
name: 'crawlMode',
|
|
12
|
+
type: 'options',
|
|
13
|
+
options: [
|
|
14
|
+
{
|
|
15
|
+
name: 'Manual URL List',
|
|
16
|
+
value: 'manual',
|
|
17
|
+
description: 'Provide an explicit list of URLs to crawl',
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
name: 'Discover From Seed URL',
|
|
21
|
+
value: 'discover',
|
|
22
|
+
description: 'Start from a homepage and recursively follow links matching your keywords (e.g. find all product pages, documentation sections, or contact info)',
|
|
23
|
+
},
|
|
24
|
+
],
|
|
25
|
+
default: 'manual',
|
|
26
|
+
displayOptions: {
|
|
27
|
+
show: {
|
|
28
|
+
operation: ['crawlMultipleUrls'],
|
|
29
|
+
},
|
|
30
|
+
},
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
displayName: 'Discovery Options',
|
|
34
|
+
name: 'discoveryOptions',
|
|
35
|
+
type: 'collection',
|
|
36
|
+
placeholder: 'Add Option',
|
|
37
|
+
default: {},
|
|
38
|
+
displayOptions: {
|
|
39
|
+
show: {
|
|
40
|
+
operation: ['crawlMultipleUrls'],
|
|
41
|
+
crawlMode: ['discover'],
|
|
42
|
+
},
|
|
43
|
+
},
|
|
44
|
+
options: [
|
|
45
|
+
{
|
|
46
|
+
displayName: 'Crawl Strategy',
|
|
47
|
+
name: 'crawlStrategy',
|
|
48
|
+
type: 'options',
|
|
49
|
+
options: [
|
|
50
|
+
{
|
|
51
|
+
name: 'Best-First (Recommended)',
|
|
52
|
+
value: 'BestFirstCrawlingStrategy',
|
|
53
|
+
description: 'Visit highest-scoring pages first, regardless of depth. Best for finding most relevant content quickly. Requires query keywords to score pages.',
|
|
54
|
+
},
|
|
55
|
+
{
|
|
56
|
+
name: 'Breadth-First Search (BFS)',
|
|
57
|
+
value: 'BFSDeepCrawlStrategy',
|
|
58
|
+
description: 'Explore all pages at each depth level before going deeper. Best for comprehensive coverage of nearby pages.',
|
|
59
|
+
},
|
|
60
|
+
{
|
|
61
|
+
name: 'Depth-First Search (DFS)',
|
|
62
|
+
value: 'DFSDeepCrawlStrategy',
|
|
63
|
+
description: 'Follow links as deep as possible on each branch before backtracking. Best for focused deep exploration of specific paths.',
|
|
64
|
+
},
|
|
65
|
+
],
|
|
66
|
+
default: 'BestFirstCrawlingStrategy',
|
|
67
|
+
description: 'How the crawler explores links. Best-First prioritises relevance, BFS ensures breadth, DFS goes deep. Note: Best-First and DFS are validated but not officially tested by Crawl4AI.',
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
displayName: 'Discovery Query',
|
|
71
|
+
name: 'query',
|
|
72
|
+
type: 'string',
|
|
73
|
+
default: '',
|
|
74
|
+
placeholder: 'pricing features documentation',
|
|
75
|
+
description: 'Keywords that guide which links to follow. Use spaces for AND logic, "OR" for alternatives (e.g. "api documentation" finds pages about APIs and docs; "pricing OR plans" finds either).',
|
|
76
|
+
},
|
|
77
|
+
{
|
|
78
|
+
displayName: 'Exclude Domains',
|
|
79
|
+
name: 'excludeDomains',
|
|
80
|
+
type: 'string',
|
|
81
|
+
default: '',
|
|
82
|
+
placeholder: 'social.example.com, cdn.example.com',
|
|
83
|
+
description: 'Block specific domains even if external crawling is enabled. Useful for excluding CDNs, social media subdomains, etc.',
|
|
84
|
+
},
|
|
85
|
+
{
|
|
86
|
+
displayName: 'Exclude Patterns',
|
|
87
|
+
name: 'excludePatterns',
|
|
88
|
+
type: 'string',
|
|
89
|
+
default: '',
|
|
90
|
+
placeholder: '*/careers/*, */legal/*',
|
|
91
|
+
description: 'Skip URLs matching these patterns. Useful for avoiding careers, legal, login pages, etc.',
|
|
92
|
+
},
|
|
93
|
+
{
|
|
94
|
+
displayName: 'Include External Domains',
|
|
95
|
+
name: 'includeExternal',
|
|
96
|
+
type: 'boolean',
|
|
97
|
+
default: false,
|
|
98
|
+
description: 'Whether to follow links to other domains. Leave OFF to stay on the same site, turn ON to follow external references.',
|
|
99
|
+
},
|
|
100
|
+
{
|
|
101
|
+
displayName: 'Include Patterns',
|
|
102
|
+
name: 'includePatterns',
|
|
103
|
+
type: 'string',
|
|
104
|
+
default: '',
|
|
105
|
+
placeholder: '*/products/*, */pricing/*',
|
|
106
|
+
description: 'Only follow URLs matching these patterns (wildcards supported). Example: "*/blog/*, */docs/*" only crawls blog and docs sections.',
|
|
107
|
+
},
|
|
108
|
+
{
|
|
109
|
+
displayName: 'Limit Returned Results',
|
|
110
|
+
name: 'resultLimit',
|
|
111
|
+
type: 'number',
|
|
112
|
+
default: 0,
|
|
113
|
+
description: 'Cap the number of results returned to n8n (0 = return all discovered pages). Useful for sampling or performance.',
|
|
114
|
+
},
|
|
115
|
+
{
|
|
116
|
+
displayName: 'Maximum Depth',
|
|
117
|
+
name: 'maxDepth',
|
|
118
|
+
type: 'number',
|
|
119
|
+
default: 2,
|
|
120
|
+
description: 'How many link levels deep to crawl. Depth 1 = only pages linked from seed; Depth 2 = seed + 1 more hop; etc. Range: 1-5. Lower is faster but less comprehensive.',
|
|
121
|
+
},
|
|
122
|
+
{
|
|
123
|
+
displayName: 'Maximum Pages',
|
|
124
|
+
name: 'maxPages',
|
|
125
|
+
type: 'number',
|
|
126
|
+
default: 50,
|
|
127
|
+
description: 'Hard limit on total pages crawled to prevent runaway crawls. Range: 1-200. Tip: Start with 20-50 for testing, increase if needed.',
|
|
128
|
+
},
|
|
129
|
+
{
|
|
130
|
+
displayName: 'Respect robots.txt',
|
|
131
|
+
name: 'respectRobotsTxt',
|
|
132
|
+
type: 'boolean',
|
|
133
|
+
default: true,
|
|
134
|
+
description: 'Whether to check and respect robots.txt directives during discovery',
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
displayName: 'Score Threshold',
|
|
138
|
+
name: 'scoreThreshold',
|
|
139
|
+
type: 'number',
|
|
140
|
+
default: 0,
|
|
141
|
+
description: 'Minimum relevance score (0-1) for pages to be crawled. 0 = no threshold. Only used with BFS/DFS strategies; Best-First automatically prioritises high scores.',
|
|
142
|
+
displayOptions: {
|
|
143
|
+
show: {
|
|
144
|
+
crawlStrategy: ['BFSDeepCrawlStrategy', 'DFSDeepCrawlStrategy'],
|
|
145
|
+
},
|
|
146
|
+
},
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
displayName: 'Seed URL',
|
|
150
|
+
name: 'seedUrl',
|
|
151
|
+
type: 'string',
|
|
152
|
+
default: '',
|
|
153
|
+
placeholder: 'https://example.com',
|
|
154
|
+
description: 'Starting point URL (usually homepage or main section). The crawler will follow links from here matching your query.',
|
|
155
|
+
required: true,
|
|
156
|
+
},
|
|
157
|
+
],
|
|
158
|
+
},
|
|
159
|
+
{
|
|
160
|
+
displayName: 'Extraction Options',
|
|
161
|
+
name: 'extractionOptions',
|
|
162
|
+
type: 'collection',
|
|
163
|
+
placeholder: 'Add Extraction',
|
|
164
|
+
default: {},
|
|
165
|
+
displayOptions: {
|
|
166
|
+
show: {
|
|
167
|
+
operation: ['crawlMultipleUrls'],
|
|
168
|
+
crawlMode: ['discover'],
|
|
169
|
+
},
|
|
170
|
+
},
|
|
171
|
+
description: 'Apply extraction strategies to discovered pages (shallow crawl with extraction)',
|
|
172
|
+
options: [
|
|
173
|
+
{
|
|
174
|
+
displayName: 'Base Selector',
|
|
175
|
+
name: 'baseSelector',
|
|
176
|
+
type: 'string',
|
|
177
|
+
default: '',
|
|
178
|
+
placeholder: 'article, .product-card',
|
|
179
|
+
description: 'CSS selector for the repeating container element',
|
|
180
|
+
displayOptions: {
|
|
181
|
+
show: {
|
|
182
|
+
enableExtraction: [true],
|
|
183
|
+
extractionType: ['css'],
|
|
184
|
+
},
|
|
185
|
+
},
|
|
186
|
+
},
|
|
187
|
+
{
|
|
188
|
+
displayName: 'Enable Extraction',
|
|
189
|
+
name: 'enableExtraction',
|
|
190
|
+
type: 'boolean',
|
|
191
|
+
default: false,
|
|
192
|
+
description: 'Whether to apply an extraction strategy to each discovered page',
|
|
193
|
+
},
|
|
194
|
+
{
|
|
195
|
+
displayName: 'Extraction Instructions',
|
|
196
|
+
name: 'llmInstruction',
|
|
197
|
+
type: 'string',
|
|
198
|
+
typeOptions: {
|
|
199
|
+
rows: 4,
|
|
200
|
+
},
|
|
201
|
+
default: '',
|
|
202
|
+
placeholder: 'Extract the main content, title, and any contact information from each page.',
|
|
203
|
+
description: 'Natural language instructions for the LLM on what to extract',
|
|
204
|
+
displayOptions: {
|
|
205
|
+
show: {
|
|
206
|
+
enableExtraction: [true],
|
|
207
|
+
extractionType: ['llm'],
|
|
208
|
+
},
|
|
209
|
+
},
|
|
210
|
+
},
|
|
211
|
+
{
|
|
212
|
+
displayName: 'Extraction Type',
|
|
213
|
+
name: 'extractionType',
|
|
214
|
+
type: 'options',
|
|
215
|
+
options: [
|
|
216
|
+
{
|
|
217
|
+
name: 'CSS Selector Schema',
|
|
218
|
+
value: 'css',
|
|
219
|
+
description: 'Extract structured data using CSS selectors',
|
|
220
|
+
},
|
|
221
|
+
{
|
|
222
|
+
name: 'LLM Extraction',
|
|
223
|
+
value: 'llm',
|
|
224
|
+
description: 'Extract data using LLM with natural language instructions (requires LLM credentials)',
|
|
225
|
+
},
|
|
226
|
+
],
|
|
227
|
+
default: 'css',
|
|
228
|
+
description: 'Type of extraction to apply to discovered pages',
|
|
229
|
+
displayOptions: {
|
|
230
|
+
show: {
|
|
231
|
+
enableExtraction: [true],
|
|
232
|
+
},
|
|
233
|
+
},
|
|
234
|
+
},
|
|
235
|
+
{
|
|
236
|
+
displayName: 'Field Selectors (JSON)',
|
|
237
|
+
name: 'fieldSelectors',
|
|
238
|
+
type: 'string',
|
|
239
|
+
typeOptions: {
|
|
240
|
+
rows: 6,
|
|
241
|
+
},
|
|
242
|
+
default: '[\n {"name": "title", "selector": "h1, h2", "type": "text"},\n {"name": "content", "selector": "p, .content", "type": "text"}\n]',
|
|
243
|
+
placeholder: '[{"name": "title", "selector": "h1", "type": "text"}]',
|
|
244
|
+
description: 'JSON array of field definitions: [{name, selector, type, attribute?}]. Type can be "text", "html", or "attribute".',
|
|
245
|
+
displayOptions: {
|
|
246
|
+
show: {
|
|
247
|
+
enableExtraction: [true],
|
|
248
|
+
extractionType: ['css'],
|
|
249
|
+
},
|
|
250
|
+
},
|
|
251
|
+
},
|
|
252
|
+
{
|
|
253
|
+
displayName: 'Schema (JSON)',
|
|
254
|
+
name: 'llmSchema',
|
|
255
|
+
type: 'string',
|
|
256
|
+
typeOptions: {
|
|
257
|
+
rows: 8,
|
|
258
|
+
},
|
|
259
|
+
default: '{\n "type": "object",\n "properties": {\n "title": {"type": "string"},\n "summary": {"type": "string"}\n }\n}',
|
|
260
|
+
placeholder: '{"type": "object", "properties": {"title": {"type": "string"}}}',
|
|
261
|
+
description: 'JSON schema defining the structure of data to extract',
|
|
262
|
+
displayOptions: {
|
|
263
|
+
show: {
|
|
264
|
+
enableExtraction: [true],
|
|
265
|
+
extractionType: ['llm'],
|
|
266
|
+
},
|
|
267
|
+
},
|
|
268
|
+
},
|
|
269
|
+
],
|
|
270
|
+
},
|
|
9
271
|
{
|
|
10
272
|
displayName: 'URLs',
|
|
11
273
|
name: 'urls',
|
|
12
274
|
type: 'string',
|
|
13
|
-
required: true,
|
|
14
275
|
default: '',
|
|
15
276
|
placeholder: 'https://example.com, https://example.org',
|
|
16
|
-
description: 'Comma-separated list of URLs to crawl',
|
|
277
|
+
description: 'Comma-separated list of URLs to crawl. Required when using Manual URL List mode.',
|
|
17
278
|
displayOptions: {
|
|
18
279
|
show: {
|
|
19
280
|
operation: ['crawlMultipleUrls'],
|
|
20
281
|
},
|
|
282
|
+
hide: {
|
|
283
|
+
crawlMode: ['discover'],
|
|
284
|
+
},
|
|
21
285
|
},
|
|
22
286
|
},
|
|
23
287
|
{
|
|
@@ -46,6 +310,32 @@ exports.description = [
|
|
|
46
310
|
default: false,
|
|
47
311
|
description: 'Whether to enable stealth mode to bypass basic bot detection (hides webdriver properties and modifies browser fingerprints)',
|
|
48
312
|
},
|
|
313
|
+
{
|
|
314
|
+
displayName: 'Extra Browser Arguments',
|
|
315
|
+
name: 'extraArgs',
|
|
316
|
+
type: 'fixedCollection',
|
|
317
|
+
typeOptions: {
|
|
318
|
+
multipleValues: true,
|
|
319
|
+
},
|
|
320
|
+
default: {},
|
|
321
|
+
description: 'Additional command-line arguments to pass to the browser (advanced users only)',
|
|
322
|
+
options: [
|
|
323
|
+
{
|
|
324
|
+
name: 'args',
|
|
325
|
+
displayName: 'Arguments',
|
|
326
|
+
values: [
|
|
327
|
+
{
|
|
328
|
+
displayName: 'Argument',
|
|
329
|
+
name: 'value',
|
|
330
|
+
type: 'string',
|
|
331
|
+
default: '',
|
|
332
|
+
placeholder: '--disable-blink-features=AutomationControlled',
|
|
333
|
+
description: 'Browser command-line argument (e.g., --disable-blink-features=AutomationControlled)',
|
|
334
|
+
},
|
|
335
|
+
],
|
|
336
|
+
},
|
|
337
|
+
],
|
|
338
|
+
},
|
|
49
339
|
{
|
|
50
340
|
displayName: 'Headless Mode',
|
|
51
341
|
name: 'headless',
|
|
@@ -84,6 +374,71 @@ exports.description = [
|
|
|
84
374
|
},
|
|
85
375
|
],
|
|
86
376
|
},
|
|
377
|
+
{
|
|
378
|
+
displayName: 'Session & Authentication',
|
|
379
|
+
name: 'sessionOptions',
|
|
380
|
+
type: 'collection',
|
|
381
|
+
placeholder: 'Add Option',
|
|
382
|
+
default: {},
|
|
383
|
+
displayOptions: {
|
|
384
|
+
show: {
|
|
385
|
+
operation: ['crawlMultipleUrls'],
|
|
386
|
+
},
|
|
387
|
+
},
|
|
388
|
+
options: [
|
|
389
|
+
{
|
|
390
|
+
displayName: 'Cookies',
|
|
391
|
+
name: 'cookies',
|
|
392
|
+
type: 'json',
|
|
393
|
+
default: '',
|
|
394
|
+
placeholder: '[{"name": "session_id", "value": "abc123", "domain": ".example.com", "path": "/"}]',
|
|
395
|
+
description: 'Array of cookie objects to inject. Alternative to storage state for simple cookie-based auth.',
|
|
396
|
+
},
|
|
397
|
+
{
|
|
398
|
+
displayName: 'Storage State (JSON)',
|
|
399
|
+
name: 'storageState',
|
|
400
|
+
type: 'string',
|
|
401
|
+
typeOptions: {
|
|
402
|
+
rows: 6,
|
|
403
|
+
},
|
|
404
|
+
default: '',
|
|
405
|
+
placeholder: '{"cookies": [...], "origins": [...]}',
|
|
406
|
+
description: 'Browser storage state as JSON (cookies, localStorage, sessionStorage). Captures authenticated session state. Works in all n8n environments.',
|
|
407
|
+
},
|
|
408
|
+
{
|
|
409
|
+
displayName: 'Use Managed Browser',
|
|
410
|
+
name: 'useManagedBrowser',
|
|
411
|
+
type: 'boolean',
|
|
412
|
+
default: false,
|
|
413
|
+
description: 'Whether to use managed browser mode (required for persistent contexts). Advanced option.',
|
|
414
|
+
displayOptions: {
|
|
415
|
+
show: {
|
|
416
|
+
usePersistentContext: [true],
|
|
417
|
+
},
|
|
418
|
+
},
|
|
419
|
+
},
|
|
420
|
+
{
|
|
421
|
+
displayName: 'Use Persistent Browser Context',
|
|
422
|
+
name: 'usePersistentContext',
|
|
423
|
+
type: 'boolean',
|
|
424
|
+
default: false,
|
|
425
|
+
description: 'Whether to use a persistent browser context (requires user data directory). Only use in self-hosted environments with persistent storage.',
|
|
426
|
+
},
|
|
427
|
+
{
|
|
428
|
+
displayName: 'User Data Directory',
|
|
429
|
+
name: 'userDataDir',
|
|
430
|
+
type: 'string',
|
|
431
|
+
default: '',
|
|
432
|
+
placeholder: '/data/browser-profiles/profile1',
|
|
433
|
+
description: 'Path to browser profile directory for persistent sessions. Advanced: Only works in self-hosted n8n with persistent volumes. Use Storage State for cloud deployments.',
|
|
434
|
+
displayOptions: {
|
|
435
|
+
show: {
|
|
436
|
+
usePersistentContext: [true],
|
|
437
|
+
},
|
|
438
|
+
},
|
|
439
|
+
},
|
|
440
|
+
],
|
|
441
|
+
},
|
|
87
442
|
{
|
|
88
443
|
displayName: 'Crawler Options',
|
|
89
444
|
name: 'crawlerOptions',
|
|
@@ -206,8 +561,8 @@ exports.description = [
|
|
|
206
561
|
],
|
|
207
562
|
},
|
|
208
563
|
{
|
|
209
|
-
displayName: 'Options',
|
|
210
|
-
name: '
|
|
564
|
+
displayName: 'Output Options',
|
|
565
|
+
name: 'outputOptions',
|
|
211
566
|
type: 'collection',
|
|
212
567
|
placeholder: 'Add Option',
|
|
213
568
|
default: {},
|
|
@@ -217,12 +572,78 @@ exports.description = [
|
|
|
217
572
|
},
|
|
218
573
|
},
|
|
219
574
|
options: [
|
|
575
|
+
{
|
|
576
|
+
displayName: 'Capture Screenshot',
|
|
577
|
+
name: 'screenshot',
|
|
578
|
+
type: 'boolean',
|
|
579
|
+
default: false,
|
|
580
|
+
description: 'Whether to capture a screenshot of each page (returned as base64)',
|
|
581
|
+
},
|
|
582
|
+
{
|
|
583
|
+
displayName: 'Fetch SSL Certificate',
|
|
584
|
+
name: 'fetchSslCertificate',
|
|
585
|
+
type: 'boolean',
|
|
586
|
+
default: false,
|
|
587
|
+
description: 'Whether to retrieve SSL certificate information from each server',
|
|
588
|
+
},
|
|
589
|
+
{
|
|
590
|
+
displayName: 'Generate PDF',
|
|
591
|
+
name: 'pdf',
|
|
592
|
+
type: 'boolean',
|
|
593
|
+
default: false,
|
|
594
|
+
description: 'Whether to generate a PDF of each page (returned as base64 or binary)',
|
|
595
|
+
},
|
|
596
|
+
{
|
|
597
|
+
displayName: 'Include Links',
|
|
598
|
+
name: 'includeLinks',
|
|
599
|
+
type: 'boolean',
|
|
600
|
+
default: true,
|
|
601
|
+
description: 'Whether to include structured internal/external links data in output',
|
|
602
|
+
},
|
|
220
603
|
{
|
|
221
604
|
displayName: 'Include Media Data',
|
|
222
605
|
name: 'includeMedia',
|
|
223
606
|
type: 'boolean',
|
|
224
607
|
default: false,
|
|
225
|
-
description: 'Whether to include media data in output (images, videos)',
|
|
608
|
+
description: 'Whether to include media data in output (images, videos, audios)',
|
|
609
|
+
},
|
|
610
|
+
{
|
|
611
|
+
displayName: 'Include Tables',
|
|
612
|
+
name: 'includeTables',
|
|
613
|
+
type: 'boolean',
|
|
614
|
+
default: true,
|
|
615
|
+
description: 'Whether to include extracted tables in the output (if table extraction is enabled)',
|
|
616
|
+
},
|
|
617
|
+
{
|
|
618
|
+
displayName: 'Markdown Output',
|
|
619
|
+
name: 'markdownOutput',
|
|
620
|
+
type: 'options',
|
|
621
|
+
options: [
|
|
622
|
+
{
|
|
623
|
+
name: 'Raw Markdown',
|
|
624
|
+
value: 'raw',
|
|
625
|
+
description: 'Return raw markdown (default, full content)',
|
|
626
|
+
},
|
|
627
|
+
{
|
|
628
|
+
name: 'Filtered Markdown',
|
|
629
|
+
value: 'fit',
|
|
630
|
+
description: 'Return content-filtered markdown (cleaner, main content only)',
|
|
631
|
+
},
|
|
632
|
+
{
|
|
633
|
+
name: 'Both',
|
|
634
|
+
value: 'both',
|
|
635
|
+
description: 'Return both raw and filtered markdown variants',
|
|
636
|
+
},
|
|
637
|
+
],
|
|
638
|
+
default: 'raw',
|
|
639
|
+
description: 'Which markdown variant(s) to return in the output',
|
|
640
|
+
},
|
|
641
|
+
{
|
|
642
|
+
displayName: 'Max Concurrent Crawls',
|
|
643
|
+
name: 'maxConcurrent',
|
|
644
|
+
type: 'number',
|
|
645
|
+
default: 5,
|
|
646
|
+
description: 'Maximum number of concurrent crawls',
|
|
226
647
|
},
|
|
227
648
|
{
|
|
228
649
|
displayName: 'Verbose Response',
|
|
@@ -231,49 +652,828 @@ exports.description = [
|
|
|
231
652
|
default: false,
|
|
232
653
|
description: 'Whether to include detailed data in output (HTML, status codes, etc.)',
|
|
233
654
|
},
|
|
655
|
+
],
|
|
656
|
+
},
|
|
657
|
+
{
|
|
658
|
+
displayName: 'Content Filter',
|
|
659
|
+
name: 'contentFilter',
|
|
660
|
+
type: 'collection',
|
|
661
|
+
placeholder: 'Add Filter',
|
|
662
|
+
default: {},
|
|
663
|
+
displayOptions: {
|
|
664
|
+
show: {
|
|
665
|
+
operation: ['crawlMultipleUrls'],
|
|
666
|
+
},
|
|
667
|
+
},
|
|
668
|
+
options: [
|
|
234
669
|
{
|
|
235
|
-
displayName: '
|
|
236
|
-
name: '
|
|
670
|
+
displayName: 'BM25 Threshold',
|
|
671
|
+
name: 'bm25Threshold',
|
|
672
|
+
type: 'number',
|
|
673
|
+
default: 1.0,
|
|
674
|
+
displayOptions: {
|
|
675
|
+
show: {
|
|
676
|
+
filterType: ['bm25'],
|
|
677
|
+
},
|
|
678
|
+
},
|
|
679
|
+
description: 'Minimum BM25 score threshold for content inclusion (default: 1.0)',
|
|
680
|
+
},
|
|
681
|
+
{
|
|
682
|
+
displayName: 'Chunk Token Threshold',
|
|
683
|
+
name: 'chunkTokenThreshold',
|
|
684
|
+
type: 'number',
|
|
685
|
+
displayOptions: {
|
|
686
|
+
show: {
|
|
687
|
+
filterType: ['llm'],
|
|
688
|
+
},
|
|
689
|
+
},
|
|
690
|
+
default: 8192,
|
|
691
|
+
description: 'Maximum tokens per chunk for LLM processing (default: 8192, recommended: 4096-16384)',
|
|
692
|
+
},
|
|
693
|
+
{
|
|
694
|
+
displayName: 'Filter Type',
|
|
695
|
+
name: 'filterType',
|
|
696
|
+
type: 'options',
|
|
697
|
+
options: [
|
|
698
|
+
{
|
|
699
|
+
name: 'None',
|
|
700
|
+
value: 'none',
|
|
701
|
+
description: 'No content filtering (return all content)',
|
|
702
|
+
},
|
|
703
|
+
{
|
|
704
|
+
name: 'Pruning Filter',
|
|
705
|
+
value: 'pruning',
|
|
706
|
+
description: 'Remove low-value content using relevance thresholds',
|
|
707
|
+
},
|
|
708
|
+
{
|
|
709
|
+
name: 'BM25 Filter',
|
|
710
|
+
value: 'bm25',
|
|
711
|
+
description: 'Filter content based on query relevance using BM25 algorithm',
|
|
712
|
+
},
|
|
713
|
+
{
|
|
714
|
+
name: 'LLM Filter',
|
|
715
|
+
value: 'llm',
|
|
716
|
+
description: 'Intelligent content filtering using LLM (requires LLM credentials)',
|
|
717
|
+
},
|
|
718
|
+
],
|
|
719
|
+
default: 'none',
|
|
720
|
+
description: 'Type of content filtering to apply',
|
|
721
|
+
},
|
|
722
|
+
{
|
|
723
|
+
displayName: 'Ignore Cache',
|
|
724
|
+
name: 'ignoreCache',
|
|
725
|
+
type: 'boolean',
|
|
726
|
+
displayOptions: {
|
|
727
|
+
show: {
|
|
728
|
+
filterType: ['llm'],
|
|
729
|
+
},
|
|
730
|
+
},
|
|
731
|
+
default: false,
|
|
732
|
+
description: 'Whether to skip cache and always generate fresh filtered content',
|
|
733
|
+
},
|
|
734
|
+
{
|
|
735
|
+
displayName: 'Ignore Links',
|
|
736
|
+
name: 'ignoreLinks',
|
|
737
|
+
type: 'boolean',
|
|
738
|
+
default: false,
|
|
739
|
+
description: 'Whether to exclude links from markdown output',
|
|
740
|
+
},
|
|
741
|
+
{
|
|
742
|
+
displayName: 'LLM Instruction',
|
|
743
|
+
name: 'llmInstruction',
|
|
744
|
+
type: 'string',
|
|
745
|
+
typeOptions: {
|
|
746
|
+
rows: 8,
|
|
747
|
+
},
|
|
748
|
+
displayOptions: {
|
|
749
|
+
show: {
|
|
750
|
+
filterType: ['llm'],
|
|
751
|
+
},
|
|
752
|
+
},
|
|
753
|
+
default: `Extract the main content while preserving its original wording and substance completely.
|
|
754
|
+
Remove only clearly irrelevant elements like:
|
|
755
|
+
- Navigation menus
|
|
756
|
+
- Advertisement sections
|
|
757
|
+
- Cookie notices
|
|
758
|
+
- Footers with site information
|
|
759
|
+
- Sidebars with external links
|
|
760
|
+
- Any UI elements that don't contribute to the content
|
|
761
|
+
|
|
762
|
+
Keep all valuable educational or informational content intact.`,
|
|
763
|
+
description: 'Instructions for the LLM on how to filter and clean the content',
|
|
764
|
+
required: true,
|
|
765
|
+
},
|
|
766
|
+
{
|
|
767
|
+
displayName: 'LLM Verbose',
|
|
768
|
+
name: 'llmVerbose',
|
|
769
|
+
type: 'boolean',
|
|
770
|
+
displayOptions: {
|
|
771
|
+
show: {
|
|
772
|
+
filterType: ['llm'],
|
|
773
|
+
},
|
|
774
|
+
},
|
|
775
|
+
default: false,
|
|
776
|
+
description: 'Whether to enable verbose logging for LLM content filtering',
|
|
777
|
+
},
|
|
778
|
+
{
|
|
779
|
+
displayName: 'Min Word Threshold',
|
|
780
|
+
name: 'minWordThreshold',
|
|
781
|
+
type: 'number',
|
|
782
|
+
default: 0,
|
|
783
|
+
displayOptions: {
|
|
784
|
+
show: {
|
|
785
|
+
filterType: ['pruning'],
|
|
786
|
+
},
|
|
787
|
+
},
|
|
788
|
+
description: 'Minimum word count for content blocks to be considered (0 = no minimum)',
|
|
789
|
+
},
|
|
790
|
+
{
|
|
791
|
+
displayName: 'Threshold',
|
|
792
|
+
name: 'threshold',
|
|
793
|
+
type: 'number',
|
|
794
|
+
default: 0.48,
|
|
795
|
+
displayOptions: {
|
|
796
|
+
show: {
|
|
797
|
+
filterType: ['pruning'],
|
|
798
|
+
},
|
|
799
|
+
},
|
|
800
|
+
description: 'Relevance threshold for pruning (0.0-1.0, default: 0.48). Higher values = more aggressive filtering.',
|
|
801
|
+
},
|
|
802
|
+
{
|
|
803
|
+
displayName: 'Threshold Type',
|
|
804
|
+
name: 'thresholdType',
|
|
805
|
+
type: 'options',
|
|
806
|
+
options: [
|
|
807
|
+
{
|
|
808
|
+
name: 'Fixed',
|
|
809
|
+
value: 'fixed',
|
|
810
|
+
description: 'Use fixed threshold value',
|
|
811
|
+
},
|
|
812
|
+
{
|
|
813
|
+
name: 'Dynamic',
|
|
814
|
+
value: 'dynamic',
|
|
815
|
+
description: 'Calculate threshold dynamically based on content',
|
|
816
|
+
},
|
|
817
|
+
],
|
|
818
|
+
default: 'fixed',
|
|
819
|
+
displayOptions: {
|
|
820
|
+
show: {
|
|
821
|
+
filterType: ['pruning'],
|
|
822
|
+
},
|
|
823
|
+
},
|
|
824
|
+
description: 'How to apply the pruning threshold',
|
|
825
|
+
},
|
|
826
|
+
{
|
|
827
|
+
displayName: 'User Query',
|
|
828
|
+
name: 'userQuery',
|
|
829
|
+
type: 'string',
|
|
830
|
+
default: '',
|
|
831
|
+
placeholder: 'main content topics keywords',
|
|
832
|
+
displayOptions: {
|
|
833
|
+
show: {
|
|
834
|
+
filterType: ['bm25'],
|
|
835
|
+
},
|
|
836
|
+
},
|
|
837
|
+
description: 'Query string to filter relevant content (BM25 will rank content by relevance to this query)',
|
|
838
|
+
},
|
|
839
|
+
],
|
|
840
|
+
},
|
|
841
|
+
{
|
|
842
|
+
displayName: 'Advanced Options',
|
|
843
|
+
name: 'advancedOptions',
|
|
844
|
+
type: 'collection',
|
|
845
|
+
placeholder: 'Add Option',
|
|
846
|
+
default: {},
|
|
847
|
+
displayOptions: {
|
|
848
|
+
show: {
|
|
849
|
+
operation: ['crawlMultipleUrls'],
|
|
850
|
+
},
|
|
851
|
+
},
|
|
852
|
+
options: [
|
|
853
|
+
{
|
|
854
|
+
displayName: 'Anti-Bot Features',
|
|
855
|
+
name: 'antiBotFeatures',
|
|
856
|
+
type: 'fixedCollection',
|
|
857
|
+
default: {},
|
|
858
|
+
options: [
|
|
859
|
+
{
|
|
860
|
+
name: 'features',
|
|
861
|
+
displayName: 'Features',
|
|
862
|
+
values: [
|
|
863
|
+
{
|
|
864
|
+
displayName: 'Magic Mode',
|
|
865
|
+
name: 'magic',
|
|
866
|
+
type: 'boolean',
|
|
867
|
+
default: false,
|
|
868
|
+
description: 'Whether to enable anti-detection techniques (stealth++)',
|
|
869
|
+
},
|
|
870
|
+
{
|
|
871
|
+
displayName: 'Simulate User Behavior',
|
|
872
|
+
name: 'simulateUser',
|
|
873
|
+
type: 'boolean',
|
|
874
|
+
default: false,
|
|
875
|
+
description: 'Whether to simulate human-like browsing behavior',
|
|
876
|
+
},
|
|
877
|
+
{
|
|
878
|
+
displayName: 'Override Navigator',
|
|
879
|
+
name: 'overrideNavigator',
|
|
880
|
+
type: 'boolean',
|
|
881
|
+
default: false,
|
|
882
|
+
description: 'Whether to override navigator properties to avoid detection',
|
|
883
|
+
},
|
|
884
|
+
],
|
|
885
|
+
},
|
|
886
|
+
],
|
|
887
|
+
},
|
|
888
|
+
{
|
|
889
|
+
displayName: 'Delay Before Return (Ms)',
|
|
890
|
+
name: 'delayBeforeReturnHtml',
|
|
237
891
|
type: 'number',
|
|
892
|
+
default: 0,
|
|
893
|
+
description: 'Milliseconds to wait before returning HTML (useful for dynamic content)',
|
|
894
|
+
},
|
|
895
|
+
{
|
|
896
|
+
displayName: 'Exclude External Images',
|
|
897
|
+
name: 'excludeExternalImages',
|
|
898
|
+
type: 'boolean',
|
|
899
|
+
default: false,
|
|
900
|
+
description: 'Whether to exclude images hosted on external domains',
|
|
901
|
+
},
|
|
902
|
+
{
|
|
903
|
+
displayName: 'Exclude Social Media Links',
|
|
904
|
+
name: 'excludeSocialMediaLinks',
|
|
905
|
+
type: 'boolean',
|
|
906
|
+
default: false,
|
|
907
|
+
description: 'Whether to exclude links to social media platforms',
|
|
908
|
+
},
|
|
909
|
+
{
|
|
910
|
+
displayName: 'Verbose Mode',
|
|
911
|
+
name: 'verbose',
|
|
912
|
+
type: 'boolean',
|
|
913
|
+
default: false,
|
|
914
|
+
description: 'Whether to enable verbose logging (debug mode)',
|
|
915
|
+
},
|
|
916
|
+
{
|
|
917
|
+
displayName: 'Wait Until',
|
|
918
|
+
name: 'waitUntil',
|
|
919
|
+
type: 'options',
|
|
920
|
+
options: [
|
|
921
|
+
{
|
|
922
|
+
name: 'Load',
|
|
923
|
+
value: 'load',
|
|
924
|
+
description: 'Wait for the load event',
|
|
925
|
+
},
|
|
926
|
+
{
|
|
927
|
+
name: 'DOM Content Loaded',
|
|
928
|
+
value: 'domcontentloaded',
|
|
929
|
+
description: 'Wait for DOMContentLoaded event',
|
|
930
|
+
},
|
|
931
|
+
{
|
|
932
|
+
name: 'Network Idle',
|
|
933
|
+
value: 'networkidle',
|
|
934
|
+
description: 'Wait for network to be idle (no requests for 500ms)',
|
|
935
|
+
},
|
|
936
|
+
{
|
|
937
|
+
name: 'Network Idle 2',
|
|
938
|
+
value: 'networkidle2',
|
|
939
|
+
description: 'Wait for network to be idle (no more than 2 requests for 500ms)',
|
|
940
|
+
},
|
|
941
|
+
],
|
|
942
|
+
default: 'load',
|
|
943
|
+
description: 'When to consider page load complete',
|
|
944
|
+
},
|
|
945
|
+
],
|
|
946
|
+
},
|
|
947
|
+
{
|
|
948
|
+
displayName: 'Table Extraction',
|
|
949
|
+
name: 'tableExtraction',
|
|
950
|
+
type: 'collection',
|
|
951
|
+
placeholder: 'Add Option',
|
|
952
|
+
default: {},
|
|
953
|
+
displayOptions: {
|
|
954
|
+
show: {
|
|
955
|
+
operation: ['crawlMultipleUrls'],
|
|
956
|
+
},
|
|
957
|
+
},
|
|
958
|
+
options: [
|
|
959
|
+
{
|
|
960
|
+
displayName: 'Chunk Token Threshold',
|
|
961
|
+
name: 'chunkTokenThreshold',
|
|
962
|
+
type: 'number',
|
|
963
|
+
displayOptions: {
|
|
964
|
+
show: {
|
|
965
|
+
strategyType: ['llm'],
|
|
966
|
+
enableChunking: [true],
|
|
967
|
+
},
|
|
968
|
+
},
|
|
969
|
+
default: 10000,
|
|
970
|
+
description: 'Maximum tokens per chunk when processing large tables (default: 10000)',
|
|
971
|
+
},
|
|
972
|
+
{
|
|
973
|
+
displayName: 'CSS Selector',
|
|
974
|
+
name: 'cssSelector',
|
|
975
|
+
type: 'string',
|
|
976
|
+
displayOptions: {
|
|
977
|
+
show: {
|
|
978
|
+
strategyType: ['llm'],
|
|
979
|
+
},
|
|
980
|
+
},
|
|
981
|
+
default: '',
|
|
982
|
+
placeholder: '.main-content',
|
|
983
|
+
description: 'CSS selector to focus table extraction on specific page area (optional)',
|
|
984
|
+
},
|
|
985
|
+
{
|
|
986
|
+
displayName: 'Enable Chunking',
|
|
987
|
+
name: 'enableChunking',
|
|
988
|
+
type: 'boolean',
|
|
989
|
+
displayOptions: {
|
|
990
|
+
show: {
|
|
991
|
+
strategyType: ['llm'],
|
|
992
|
+
},
|
|
993
|
+
},
|
|
994
|
+
default: false,
|
|
995
|
+
description: 'Whether to enable chunking for large tables (recommended for tables with 100+ rows)',
|
|
996
|
+
},
|
|
997
|
+
{
|
|
998
|
+
displayName: 'Max Parallel Chunks',
|
|
999
|
+
name: 'maxParallelChunks',
|
|
1000
|
+
type: 'number',
|
|
1001
|
+
displayOptions: {
|
|
1002
|
+
show: {
|
|
1003
|
+
strategyType: ['llm'],
|
|
1004
|
+
enableChunking: [true],
|
|
1005
|
+
},
|
|
1006
|
+
},
|
|
238
1007
|
default: 5,
|
|
239
|
-
description: 'Maximum number of
|
|
1008
|
+
description: 'Maximum number of chunks to process in parallel (default: 5)',
|
|
1009
|
+
},
|
|
1010
|
+
{
|
|
1011
|
+
displayName: 'Max Tries',
|
|
1012
|
+
name: 'maxTries',
|
|
1013
|
+
type: 'number',
|
|
1014
|
+
displayOptions: {
|
|
1015
|
+
show: {
|
|
1016
|
+
strategyType: ['llm'],
|
|
1017
|
+
},
|
|
1018
|
+
},
|
|
1019
|
+
default: 3,
|
|
1020
|
+
description: 'Maximum number of retry attempts for LLM extraction (default: 3)',
|
|
1021
|
+
},
|
|
1022
|
+
{
|
|
1023
|
+
displayName: 'Min Rows Per Chunk',
|
|
1024
|
+
name: 'minRowsPerChunk',
|
|
1025
|
+
type: 'number',
|
|
1026
|
+
displayOptions: {
|
|
1027
|
+
show: {
|
|
1028
|
+
strategyType: ['llm'],
|
|
1029
|
+
enableChunking: [true],
|
|
1030
|
+
},
|
|
1031
|
+
},
|
|
1032
|
+
default: 20,
|
|
1033
|
+
description: 'Minimum number of rows per chunk (default: 20)',
|
|
1034
|
+
},
|
|
1035
|
+
{
|
|
1036
|
+
displayName: 'Strategy Type',
|
|
1037
|
+
name: 'strategyType',
|
|
1038
|
+
type: 'options',
|
|
1039
|
+
options: [
|
|
1040
|
+
{
|
|
1041
|
+
name: 'None',
|
|
1042
|
+
value: 'none',
|
|
1043
|
+
description: 'No table extraction',
|
|
1044
|
+
},
|
|
1045
|
+
{
|
|
1046
|
+
name: 'LLM Table Extraction',
|
|
1047
|
+
value: 'llm',
|
|
1048
|
+
description: 'Extract tables using LLM (handles complex tables with rowspan/colspan)',
|
|
1049
|
+
},
|
|
1050
|
+
{
|
|
1051
|
+
name: 'Default Table Extraction',
|
|
1052
|
+
value: 'default',
|
|
1053
|
+
description: 'Extract tables using heuristics (faster, simpler tables only)',
|
|
1054
|
+
},
|
|
1055
|
+
],
|
|
1056
|
+
default: 'none',
|
|
1057
|
+
description: 'Table extraction strategy to use',
|
|
1058
|
+
},
|
|
1059
|
+
{
|
|
1060
|
+
displayName: 'Table Score Threshold',
|
|
1061
|
+
name: 'tableScoreThreshold',
|
|
1062
|
+
type: 'number',
|
|
1063
|
+
displayOptions: {
|
|
1064
|
+
show: {
|
|
1065
|
+
strategyType: ['default'],
|
|
1066
|
+
},
|
|
1067
|
+
},
|
|
1068
|
+
default: 5,
|
|
1069
|
+
description: 'Minimum score for table to be included in results (default: 5, range: 0-10)',
|
|
1070
|
+
},
|
|
1071
|
+
{
|
|
1072
|
+
displayName: 'Verbose',
|
|
1073
|
+
name: 'verbose',
|
|
1074
|
+
type: 'boolean',
|
|
1075
|
+
displayOptions: {
|
|
1076
|
+
show: {
|
|
1077
|
+
strategyType: ['llm', 'default'],
|
|
1078
|
+
},
|
|
1079
|
+
},
|
|
1080
|
+
default: false,
|
|
1081
|
+
description: 'Whether to enable verbose logging for table extraction',
|
|
240
1082
|
},
|
|
241
1083
|
],
|
|
242
1084
|
},
|
|
243
1085
|
];
|
|
244
1086
|
async function execute(items, nodeOptions) {
|
|
245
|
-
var _a;
|
|
1087
|
+
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
|
|
246
1088
|
const allResults = [];
|
|
247
1089
|
for (let i = 0; i < items.length; i++) {
|
|
248
1090
|
try {
|
|
249
|
-
const urlsString = this.getNodeParameter('urls', i, '');
|
|
250
1091
|
const browserOptions = this.getNodeParameter('browserOptions', i, {});
|
|
1092
|
+
const sessionOptions = this.getNodeParameter('sessionOptions', i, {});
|
|
251
1093
|
const crawlerOptions = this.getNodeParameter('crawlerOptions', i, {});
|
|
252
|
-
const
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
}
|
|
256
|
-
|
|
257
|
-
.
|
|
258
|
-
.
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'No valid URLs provided.', { itemIndex: i });
|
|
1094
|
+
const outputOptions = this.getNodeParameter('outputOptions', i, {});
|
|
1095
|
+
const contentFilter = this.getNodeParameter('contentFilter', i, {});
|
|
1096
|
+
const advancedOptions = this.getNodeParameter('advancedOptions', i, {});
|
|
1097
|
+
let mergedBrowserOptions = { ...browserOptions, ...sessionOptions };
|
|
1098
|
+
if (browserOptions.extraArgs && typeof browserOptions.extraArgs === 'object') {
|
|
1099
|
+
const extraArgsCollection = browserOptions.extraArgs;
|
|
1100
|
+
if (extraArgsCollection.args && Array.isArray(extraArgsCollection.args)) {
|
|
1101
|
+
mergedBrowserOptions.extraArgs = extraArgsCollection.args.map((arg) => arg.value).filter((v) => v);
|
|
1102
|
+
}
|
|
262
1103
|
}
|
|
263
|
-
const
|
|
264
|
-
|
|
265
|
-
|
|
1104
|
+
const crawlMode = this.getNodeParameter('crawlMode', i, 'manual');
|
|
1105
|
+
let urls = [];
|
|
1106
|
+
if (crawlMode === 'manual') {
|
|
1107
|
+
const urlsString = this.getNodeParameter('urls', i, '');
|
|
1108
|
+
if (!urlsString) {
|
|
1109
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'URLs cannot be empty.', { itemIndex: i });
|
|
1110
|
+
}
|
|
1111
|
+
urls = urlsString
|
|
1112
|
+
.split(',')
|
|
1113
|
+
.map(url => url.trim())
|
|
1114
|
+
.filter(url => url);
|
|
1115
|
+
if (urls.length === 0) {
|
|
1116
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'No valid URLs provided.', { itemIndex: i });
|
|
1117
|
+
}
|
|
1118
|
+
const invalidUrls = urls.filter(url => !(0, utils_1.isValidUrl)(url));
|
|
1119
|
+
if (invalidUrls.length > 0) {
|
|
1120
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid URLs: ${invalidUrls.join(', ')}`, { itemIndex: i });
|
|
1121
|
+
}
|
|
266
1122
|
}
|
|
267
|
-
const browserConfig = (0, utils_1.createBrowserConfig)(
|
|
268
|
-
const
|
|
1123
|
+
const browserConfig = (0, utils_1.createBrowserConfig)(mergedBrowserOptions);
|
|
1124
|
+
const antiBotFeatures = ((_a = advancedOptions.antiBotFeatures) === null || _a === void 0 ? void 0 : _a.features) || {};
|
|
1125
|
+
const mergedCrawlerOptions = {
|
|
269
1126
|
...crawlerOptions,
|
|
270
1127
|
...browserConfig,
|
|
271
|
-
maxConcurrent:
|
|
272
|
-
|
|
1128
|
+
maxConcurrent: outputOptions.maxConcurrent ? Number(outputOptions.maxConcurrent) : 5,
|
|
1129
|
+
screenshot: outputOptions.screenshot,
|
|
1130
|
+
pdf: outputOptions.pdf,
|
|
1131
|
+
fetchSslCertificate: outputOptions.fetchSslCertificate,
|
|
1132
|
+
magic: antiBotFeatures.magic,
|
|
1133
|
+
simulateUser: antiBotFeatures.simulateUser,
|
|
1134
|
+
overrideNavigator: antiBotFeatures.overrideNavigator,
|
|
1135
|
+
excludeSocialMediaLinks: advancedOptions.excludeSocialMediaLinks,
|
|
1136
|
+
excludeExternalImages: advancedOptions.excludeExternalImages,
|
|
1137
|
+
delayBeforeReturnHtml: advancedOptions.delayBeforeReturnHtml,
|
|
1138
|
+
waitUntil: advancedOptions.waitUntil,
|
|
1139
|
+
verbose: advancedOptions.verbose,
|
|
1140
|
+
};
|
|
1141
|
+
if (contentFilter.filterType && contentFilter.filterType !== 'none') {
|
|
1142
|
+
const enrichedFilterConfig = { ...contentFilter };
|
|
1143
|
+
if (contentFilter.filterType === 'llm') {
|
|
1144
|
+
const credentials = await this.getCredentials('crawl4aiPlusApi');
|
|
1145
|
+
if (!credentials.enableLlm) {
|
|
1146
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM content filtering.', { itemIndex: i });
|
|
1147
|
+
}
|
|
1148
|
+
let provider = 'openai/gpt-4o';
|
|
1149
|
+
let apiKey = '';
|
|
1150
|
+
if (credentials.llmProvider === 'openai') {
|
|
1151
|
+
const model = credentials.llmModel || 'gpt-4o';
|
|
1152
|
+
provider = `openai/${model}`;
|
|
1153
|
+
apiKey = credentials.apiKey || '';
|
|
1154
|
+
}
|
|
1155
|
+
else if (credentials.llmProvider === 'anthropic') {
|
|
1156
|
+
const model = credentials.llmModel || 'claude-3-haiku-20240307';
|
|
1157
|
+
provider = `anthropic/${model}`;
|
|
1158
|
+
apiKey = credentials.apiKey || '';
|
|
1159
|
+
}
|
|
1160
|
+
else if (credentials.llmProvider === 'groq') {
|
|
1161
|
+
const model = credentials.llmModel || 'llama3-70b-8192';
|
|
1162
|
+
provider = `groq/${model}`;
|
|
1163
|
+
apiKey = credentials.apiKey || '';
|
|
1164
|
+
}
|
|
1165
|
+
else if (credentials.llmProvider === 'ollama') {
|
|
1166
|
+
const model = credentials.ollamaModel || 'llama3';
|
|
1167
|
+
provider = `ollama/${model}`;
|
|
1168
|
+
}
|
|
1169
|
+
else if (credentials.llmProvider === 'other') {
|
|
1170
|
+
provider = credentials.customProvider || 'custom/model';
|
|
1171
|
+
apiKey = credentials.customApiKey || '';
|
|
1172
|
+
}
|
|
1173
|
+
if (!apiKey && credentials.llmProvider !== 'ollama') {
|
|
1174
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
|
|
1175
|
+
}
|
|
1176
|
+
enrichedFilterConfig.llmConfig = {
|
|
1177
|
+
type: 'LLMConfig',
|
|
1178
|
+
params: {
|
|
1179
|
+
provider,
|
|
1180
|
+
api_token: apiKey,
|
|
1181
|
+
...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
|
|
1182
|
+
{ api_base: credentials.customBaseUrl } : {}),
|
|
1183
|
+
...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
|
|
1184
|
+
{ api_base: credentials.ollamaUrl } : {})
|
|
1185
|
+
}
|
|
1186
|
+
};
|
|
1187
|
+
}
|
|
1188
|
+
mergedCrawlerOptions.markdownGenerator = (0, utils_1.createMarkdownGenerator)(enrichedFilterConfig);
|
|
1189
|
+
}
|
|
1190
|
+
const tableExtractionConfig = this.getNodeParameter('tableExtraction', i, {});
|
|
1191
|
+
if (tableExtractionConfig.strategyType && tableExtractionConfig.strategyType !== 'none') {
|
|
1192
|
+
const enrichedTableConfig = { ...tableExtractionConfig };
|
|
1193
|
+
if (tableExtractionConfig.strategyType === 'llm') {
|
|
1194
|
+
const credentials = await this.getCredentials('crawl4aiPlusApi');
|
|
1195
|
+
if (!credentials.enableLlm) {
|
|
1196
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM table extraction.', { itemIndex: i });
|
|
1197
|
+
}
|
|
1198
|
+
let provider = 'openai/gpt-4o';
|
|
1199
|
+
let apiKey = '';
|
|
1200
|
+
if (credentials.llmProvider === 'openai') {
|
|
1201
|
+
const model = credentials.llmModel || 'gpt-4o';
|
|
1202
|
+
provider = `openai/${model}`;
|
|
1203
|
+
apiKey = credentials.apiKey || '';
|
|
1204
|
+
}
|
|
1205
|
+
else if (credentials.llmProvider === 'anthropic') {
|
|
1206
|
+
const model = credentials.llmModel || 'claude-3-haiku-20240307';
|
|
1207
|
+
provider = `anthropic/${model}`;
|
|
1208
|
+
apiKey = credentials.apiKey || '';
|
|
1209
|
+
}
|
|
1210
|
+
else if (credentials.llmProvider === 'groq') {
|
|
1211
|
+
const model = credentials.llmModel || 'llama3-70b-8192';
|
|
1212
|
+
provider = `groq/${model}`;
|
|
1213
|
+
apiKey = credentials.apiKey || '';
|
|
1214
|
+
}
|
|
1215
|
+
else if (credentials.llmProvider === 'ollama') {
|
|
1216
|
+
const model = credentials.ollamaModel || 'llama3';
|
|
1217
|
+
provider = `ollama/${model}`;
|
|
1218
|
+
}
|
|
1219
|
+
else if (credentials.llmProvider === 'other') {
|
|
1220
|
+
provider = credentials.customProvider || 'custom/model';
|
|
1221
|
+
apiKey = credentials.customApiKey || '';
|
|
1222
|
+
}
|
|
1223
|
+
if (!apiKey && credentials.llmProvider !== 'ollama') {
|
|
1224
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider. Please configure it in the Crawl4AI credentials.`, { itemIndex: i });
|
|
1225
|
+
}
|
|
1226
|
+
enrichedTableConfig.llmConfig = {
|
|
1227
|
+
type: 'LLMConfig',
|
|
1228
|
+
params: {
|
|
1229
|
+
provider,
|
|
1230
|
+
api_token: apiKey,
|
|
1231
|
+
...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
|
|
1232
|
+
{ api_base: credentials.customBaseUrl } : {}),
|
|
1233
|
+
...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
|
|
1234
|
+
{ api_base: credentials.ollamaUrl } : {})
|
|
1235
|
+
}
|
|
1236
|
+
};
|
|
1237
|
+
}
|
|
1238
|
+
mergedCrawlerOptions.tableExtraction = (0, utils_1.createTableExtractionStrategy)(enrichedTableConfig);
|
|
1239
|
+
}
|
|
1240
|
+
if (crawlMode === 'discover') {
|
|
1241
|
+
const discoveryOptions = this.getNodeParameter('discoveryOptions', i, {});
|
|
1242
|
+
const seedUrl = String((_b = discoveryOptions.seedUrl) !== null && _b !== void 0 ? _b : '').trim();
|
|
1243
|
+
const query = String((_c = discoveryOptions.query) !== null && _c !== void 0 ? _c : '').trim();
|
|
1244
|
+
if (!seedUrl) {
|
|
1245
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Seed URL is required when discovery mode is enabled.', { itemIndex: i });
|
|
1246
|
+
}
|
|
1247
|
+
if (!(0, utils_1.isValidUrl)(seedUrl)) {
|
|
1248
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid Seed URL: ${seedUrl}`, { itemIndex: i });
|
|
1249
|
+
}
|
|
1250
|
+
if (!query) {
|
|
1251
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Discovery query cannot be empty.', { itemIndex: i });
|
|
1252
|
+
}
|
|
1253
|
+
const maxDepthRaw = (_d = discoveryOptions.maxDepth) !== null && _d !== void 0 ? _d : 2;
|
|
1254
|
+
const maxDepth = Math.min(Math.max(Number(maxDepthRaw), 1), 5);
|
|
1255
|
+
if (Number(maxDepthRaw) !== maxDepth) {
|
|
1256
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Maximum Depth must be between 1 and 5.', { itemIndex: i });
|
|
1257
|
+
}
|
|
1258
|
+
const maxPagesRaw = (_e = discoveryOptions.maxPages) !== null && _e !== void 0 ? _e : 50;
|
|
1259
|
+
const maxPages = Math.min(Math.max(Number(maxPagesRaw), 1), 200);
|
|
1260
|
+
if (Number(maxPagesRaw) !== maxPages) {
|
|
1261
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Maximum Pages must be between 1 and 200.', { itemIndex: i });
|
|
1262
|
+
}
|
|
1263
|
+
const includeExternal = discoveryOptions.includeExternal === true;
|
|
1264
|
+
const respectRobotsTxt = discoveryOptions.respectRobotsTxt !== false;
|
|
1265
|
+
const includePatternsRaw = discoveryOptions.includePatterns;
|
|
1266
|
+
const includePatterns = Array.isArray(includePatternsRaw)
|
|
1267
|
+
? includePatternsRaw
|
|
1268
|
+
: typeof includePatternsRaw === 'string'
|
|
1269
|
+
? includePatternsRaw
|
|
1270
|
+
.split(',')
|
|
1271
|
+
.map(value => value.trim())
|
|
1272
|
+
.filter(value => value.length > 0)
|
|
1273
|
+
: [];
|
|
1274
|
+
const excludePatternsRaw = discoveryOptions.excludePatterns;
|
|
1275
|
+
const excludePatterns = Array.isArray(excludePatternsRaw)
|
|
1276
|
+
? excludePatternsRaw
|
|
1277
|
+
: typeof excludePatternsRaw === 'string'
|
|
1278
|
+
? excludePatternsRaw
|
|
1279
|
+
.split(',')
|
|
1280
|
+
.map(value => value.trim())
|
|
1281
|
+
.filter(value => value.length > 0)
|
|
1282
|
+
: [];
|
|
1283
|
+
const excludeDomainsRaw = discoveryOptions.excludeDomains;
|
|
1284
|
+
const excludeDomains = Array.isArray(excludeDomainsRaw)
|
|
1285
|
+
? excludeDomainsRaw
|
|
1286
|
+
: typeof excludeDomainsRaw === 'string'
|
|
1287
|
+
? excludeDomainsRaw
|
|
1288
|
+
.split(',')
|
|
1289
|
+
.map(value => value.trim())
|
|
1290
|
+
.filter(value => value.length > 0)
|
|
1291
|
+
: [];
|
|
1292
|
+
const resultLimitRaw = (_f = discoveryOptions.resultLimit) !== null && _f !== void 0 ? _f : 0;
|
|
1293
|
+
const resultLimit = Math.max(Number(resultLimitRaw), 0);
|
|
1294
|
+
const filters = [];
|
|
1295
|
+
if (excludeDomains.length > 0) {
|
|
1296
|
+
filters.push({
|
|
1297
|
+
type: 'DomainFilter',
|
|
1298
|
+
params: {
|
|
1299
|
+
blocked_domains: excludeDomains,
|
|
1300
|
+
},
|
|
1301
|
+
});
|
|
1302
|
+
}
|
|
1303
|
+
if (excludePatterns.length > 0) {
|
|
1304
|
+
filters.push({
|
|
1305
|
+
type: 'URLPatternFilter',
|
|
1306
|
+
params: {
|
|
1307
|
+
patterns: excludePatterns,
|
|
1308
|
+
reverse: true,
|
|
1309
|
+
},
|
|
1310
|
+
});
|
|
1311
|
+
}
|
|
1312
|
+
if (includePatterns.length > 0) {
|
|
1313
|
+
filters.push({
|
|
1314
|
+
type: 'URLPatternFilter',
|
|
1315
|
+
params: {
|
|
1316
|
+
patterns: includePatterns,
|
|
1317
|
+
reverse: false,
|
|
1318
|
+
},
|
|
1319
|
+
});
|
|
1320
|
+
}
|
|
1321
|
+
const urlScorer = query
|
|
1322
|
+
? {
|
|
1323
|
+
type: 'KeywordRelevanceScorer',
|
|
1324
|
+
params: {
|
|
1325
|
+
keywords: query.split(/\s+OR\s+|\s+/).filter(k => k.trim()),
|
|
1326
|
+
weight: 1.0,
|
|
1327
|
+
},
|
|
1328
|
+
}
|
|
1329
|
+
: undefined;
|
|
1330
|
+
const strategyType = String((_g = discoveryOptions.crawlStrategy) !== null && _g !== void 0 ? _g : 'BestFirstCrawlingStrategy');
|
|
1331
|
+
const strategyParams = {
|
|
1332
|
+
max_depth: maxDepth,
|
|
1333
|
+
max_pages: maxPages,
|
|
1334
|
+
include_external: includeExternal,
|
|
1335
|
+
...(filters.length > 0 ? {
|
|
1336
|
+
filter_chain: {
|
|
1337
|
+
type: 'FilterChain',
|
|
1338
|
+
params: {
|
|
1339
|
+
filters,
|
|
1340
|
+
},
|
|
1341
|
+
},
|
|
1342
|
+
} : {}),
|
|
1343
|
+
...(urlScorer ? { url_scorer: urlScorer } : {}),
|
|
1344
|
+
};
|
|
1345
|
+
if (strategyType !== 'BestFirstCrawlingStrategy') {
|
|
1346
|
+
const scoreThreshold = Number((_h = discoveryOptions.scoreThreshold) !== null && _h !== void 0 ? _h : 0);
|
|
1347
|
+
if (scoreThreshold > 0) {
|
|
1348
|
+
strategyParams.score_threshold = scoreThreshold;
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
const deepCrawlStrategy = {
|
|
1352
|
+
type: strategyType,
|
|
1353
|
+
params: strategyParams,
|
|
1354
|
+
};
|
|
1355
|
+
mergedCrawlerOptions.deepCrawlStrategy = deepCrawlStrategy;
|
|
1356
|
+
if (respectRobotsTxt) {
|
|
1357
|
+
mergedCrawlerOptions.checkRobotsTxt = true;
|
|
1358
|
+
}
|
|
1359
|
+
urls = [seedUrl];
|
|
1360
|
+
mergedCrawlerOptions.__resultLimit = resultLimit;
|
|
1361
|
+
const extractionOptions = this.getNodeParameter('extractionOptions', i, {});
|
|
1362
|
+
if (extractionOptions.enableExtraction === true) {
|
|
1363
|
+
const extractionType = extractionOptions.extractionType || 'css';
|
|
1364
|
+
if (extractionType === 'css') {
|
|
1365
|
+
const baseSelector = (extractionOptions.baseSelector || '').trim();
|
|
1366
|
+
const fieldSelectorsJson = (extractionOptions.fieldSelectors || '[]').trim();
|
|
1367
|
+
if (!baseSelector) {
|
|
1368
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Base selector is required for CSS extraction.', { itemIndex: i });
|
|
1369
|
+
}
|
|
1370
|
+
let fields = [];
|
|
1371
|
+
try {
|
|
1372
|
+
fields = JSON.parse(fieldSelectorsJson);
|
|
1373
|
+
if (!Array.isArray(fields) || fields.length === 0) {
|
|
1374
|
+
throw new Error('Field selectors must be a non-empty array');
|
|
1375
|
+
}
|
|
1376
|
+
}
|
|
1377
|
+
catch (e) {
|
|
1378
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid field selectors JSON: ${e.message}`, { itemIndex: i });
|
|
1379
|
+
}
|
|
1380
|
+
mergedCrawlerOptions.extractionStrategy = {
|
|
1381
|
+
type: 'JsonCssExtractionStrategy',
|
|
1382
|
+
params: {
|
|
1383
|
+
schema: {
|
|
1384
|
+
type: 'dict',
|
|
1385
|
+
value: {
|
|
1386
|
+
name: 'ExtractedData',
|
|
1387
|
+
baseSelector,
|
|
1388
|
+
fields,
|
|
1389
|
+
},
|
|
1390
|
+
},
|
|
1391
|
+
},
|
|
1392
|
+
};
|
|
1393
|
+
}
|
|
1394
|
+
else if (extractionType === 'llm') {
|
|
1395
|
+
const llmInstruction = (extractionOptions.llmInstruction || '').trim();
|
|
1396
|
+
const llmSchemaJson = (extractionOptions.llmSchema || '{}').trim();
|
|
1397
|
+
if (!llmInstruction) {
|
|
1398
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'Extraction instructions are required for LLM extraction.', { itemIndex: i });
|
|
1399
|
+
}
|
|
1400
|
+
let schema = {};
|
|
1401
|
+
try {
|
|
1402
|
+
schema = JSON.parse(llmSchemaJson);
|
|
1403
|
+
}
|
|
1404
|
+
catch (e) {
|
|
1405
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `Invalid LLM schema JSON: ${e.message}`, { itemIndex: i });
|
|
1406
|
+
}
|
|
1407
|
+
const credentials = await this.getCredentials('crawl4aiPlusApi');
|
|
1408
|
+
if (!credentials.enableLlm) {
|
|
1409
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), 'LLM features must be enabled in Crawl4AI credentials to use LLM extraction.', { itemIndex: i });
|
|
1410
|
+
}
|
|
1411
|
+
let provider = 'openai/gpt-4o';
|
|
1412
|
+
let apiKey = '';
|
|
1413
|
+
if (credentials.llmProvider === 'openai') {
|
|
1414
|
+
const model = credentials.llmModel || 'gpt-4o';
|
|
1415
|
+
provider = `openai/${model}`;
|
|
1416
|
+
apiKey = credentials.apiKey || '';
|
|
1417
|
+
}
|
|
1418
|
+
else if (credentials.llmProvider === 'anthropic') {
|
|
1419
|
+
const model = credentials.llmModel || 'claude-3-haiku-20240307';
|
|
1420
|
+
provider = `anthropic/${model}`;
|
|
1421
|
+
apiKey = credentials.apiKey || '';
|
|
1422
|
+
}
|
|
1423
|
+
else if (credentials.llmProvider === 'groq') {
|
|
1424
|
+
const model = credentials.llmModel || 'llama3-70b-8192';
|
|
1425
|
+
provider = `groq/${model}`;
|
|
1426
|
+
apiKey = credentials.apiKey || '';
|
|
1427
|
+
}
|
|
1428
|
+
else if (credentials.llmProvider === 'ollama') {
|
|
1429
|
+
const model = credentials.ollamaModel || 'llama3';
|
|
1430
|
+
provider = `ollama/${model}`;
|
|
1431
|
+
}
|
|
1432
|
+
else if (credentials.llmProvider === 'other') {
|
|
1433
|
+
provider = credentials.customProvider || 'custom/model';
|
|
1434
|
+
apiKey = credentials.customApiKey || '';
|
|
1435
|
+
}
|
|
1436
|
+
if (!apiKey && credentials.llmProvider !== 'ollama') {
|
|
1437
|
+
throw new n8n_workflow_1.NodeOperationError(this.getNode(), `API key is required for ${credentials.llmProvider} provider.`, { itemIndex: i });
|
|
1438
|
+
}
|
|
1439
|
+
mergedCrawlerOptions.extractionStrategy = {
|
|
1440
|
+
type: 'LLMExtractionStrategy',
|
|
1441
|
+
params: {
|
|
1442
|
+
llm_config: {
|
|
1443
|
+
type: 'LLMConfig',
|
|
1444
|
+
params: {
|
|
1445
|
+
provider,
|
|
1446
|
+
api_token: apiKey,
|
|
1447
|
+
...(credentials.llmProvider === 'other' && credentials.customBaseUrl ?
|
|
1448
|
+
{ api_base: credentials.customBaseUrl } : {}),
|
|
1449
|
+
...(credentials.llmProvider === 'ollama' && credentials.ollamaUrl ?
|
|
1450
|
+
{ api_base: credentials.ollamaUrl } : {})
|
|
1451
|
+
}
|
|
1452
|
+
},
|
|
1453
|
+
schema: { type: 'dict', value: schema },
|
|
1454
|
+
instruction: llmInstruction,
|
|
1455
|
+
extraction_type: 'schema',
|
|
1456
|
+
},
|
|
1457
|
+
};
|
|
1458
|
+
}
|
|
1459
|
+
}
|
|
1460
|
+
}
|
|
1461
|
+
const crawlerConfig = (0, utils_1.createCrawlerRunConfig)(mergedCrawlerOptions);
|
|
273
1462
|
const crawler = await (0, utils_1.getCrawl4aiClient)(this);
|
|
274
1463
|
const results = await crawler.crawlMultipleUrls(urls, crawlerConfig);
|
|
275
|
-
|
|
276
|
-
|
|
1464
|
+
const resultLimit = crawlMode === 'discover'
|
|
1465
|
+
? Number((_j = mergedCrawlerOptions.__resultLimit) !== null && _j !== void 0 ? _j : 0)
|
|
1466
|
+
: 0;
|
|
1467
|
+
const limitedResults = resultLimit > 0 ? results.slice(0, resultLimit) : results;
|
|
1468
|
+
for (const result of limitedResults) {
|
|
1469
|
+
const formattedResult = (0, formatters_1.formatCrawlResult)(result, outputOptions.includeMedia, outputOptions.verboseResponse, {
|
|
1470
|
+
markdownOutput: outputOptions.markdownOutput,
|
|
1471
|
+
includeLinks: outputOptions.includeLinks,
|
|
1472
|
+
includeScreenshot: outputOptions.screenshot,
|
|
1473
|
+
includePdf: outputOptions.pdf,
|
|
1474
|
+
includeSslCertificate: outputOptions.fetchSslCertificate,
|
|
1475
|
+
includeTables: outputOptions.includeTables,
|
|
1476
|
+
});
|
|
277
1477
|
allResults.push({
|
|
278
1478
|
json: formattedResult,
|
|
279
1479
|
pairedItem: { item: i },
|
|
@@ -283,7 +1483,7 @@ async function execute(items, nodeOptions) {
|
|
|
283
1483
|
catch (error) {
|
|
284
1484
|
if (this.continueOnFail()) {
|
|
285
1485
|
const node = this.getNode();
|
|
286
|
-
const errorItemIndex = (
|
|
1486
|
+
const errorItemIndex = (_k = error.itemIndex) !== null && _k !== void 0 ? _k : i;
|
|
287
1487
|
allResults.push({
|
|
288
1488
|
json: items[i].json,
|
|
289
1489
|
error: new n8n_workflow_1.NodeOperationError(node, error.message, { itemIndex: errorItemIndex }),
|