n8n-nodes-firecrawl-latest 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +19 -0
- package/README.md +232 -0
- package/dist/credentials/FirecrawlApi.credentials.js +22 -0
- package/dist/icons/flames-icon.svg +144 -0
- package/dist/nodes/Firecrawl/FireCrawlScraper.node.js +156 -0
- package/dist/nodes/Firecrawl/resources/batchScrape/batchScrape.methods.js +253 -0
- package/dist/nodes/Firecrawl/resources/batchScrape/batchScrape.properties.js +205 -0
- package/dist/nodes/Firecrawl/resources/crawler/crawler.methods.js +281 -0
- package/dist/nodes/Firecrawl/resources/crawler/crawler.properties.js +313 -0
- package/dist/nodes/Firecrawl/resources/deepResearch/deepResearch.methods.js +171 -0
- package/dist/nodes/Firecrawl/resources/deepResearch/deepResearch.properties.js +200 -0
- package/dist/nodes/Firecrawl/resources/extract/extract.methods.js +424 -0
- package/dist/nodes/Firecrawl/resources/extract/extract.properties.js +339 -0
- package/dist/nodes/Firecrawl/resources/llmsText/llmsText.methods.js +124 -0
- package/dist/nodes/Firecrawl/resources/llmsText/llmsText.properties.js +87 -0
- package/dist/nodes/Firecrawl/resources/map/map.methods.js +52 -0
- package/dist/nodes/Firecrawl/resources/map/map.properties.js +22 -0
- package/dist/nodes/Firecrawl/resources/scrape/scrape.methods.js +203 -0
- package/dist/nodes/Firecrawl/resources/scrape/scrape.properties.js +348 -0
- package/dist/nodes/HttpBin/HttpBin.node.js +59 -0
- package/dist/nodes/HttpBin/HttpVerbDescription.js +246 -0
- package/dist/nodes/HttpBin/httpbin.svg +18 -0
- package/index.js +7 -0
- package/package.json +58 -0
package/dist/nodes/Firecrawl/resources/crawler/crawler.methods.js
@@ -0,0 +1,281 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlerMethods = void 0;
const firecrawl_js_1 = __importDefault(require("@mendable/firecrawl-js"));
const n8n_workflow_1 = require("n8n-workflow");
// Helper function to generate schema from JSON example
function generateSchemaFromExample(jsonExample) {
    if (jsonExample === null) {
        return { type: 'null' };
    }
    if (typeof jsonExample === 'string') {
        return { type: 'string' };
    }
    if (typeof jsonExample === 'number') {
        return { type: 'number' };
    }
    if (typeof jsonExample === 'boolean') {
        return { type: 'boolean' };
    }
    if (Array.isArray(jsonExample)) {
        if (jsonExample.length === 0) {
            return {
                type: 'array',
                items: { type: 'string' }, // Default to string items for empty arrays
            };
        }
        // Use the first item as a sample for the items schema
        const itemSchema = generateSchemaFromExample(jsonExample[0]);
        return {
            type: 'array',
            items: itemSchema,
        };
    }
    if (typeof jsonExample === 'object') {
        const properties = {};
        for (const [key, value] of Object.entries(jsonExample)) {
            properties[key] = generateSchemaFromExample(value);
        }
        return {
            type: 'object',
            properties,
            required: Object.keys(properties),
        };
    }
    // Default fallback
    return { type: 'string' };
}
exports.crawlerMethods = {
    async execute() {
        const items = this.getInputData();
        const returnData = [];
        // Get credentials
        const credentials = await this.getCredentials('firecrawlApi');
        const apiKey = credentials.apiKey;
        // Initialize Firecrawl app
        const firecrawl = new firecrawl_js_1.default({ apiKey });
        // Process each item
        for (let i = 0; i < items.length; i++) {
            try {
                // Get parameters
                const url = this.getNodeParameter('url', i);
                const limit = this.getNodeParameter('limit', i);
                const operationMode = this.getNodeParameter('operationMode', i);
                const outputFormats = this.getNodeParameter('outputFormats', i, ['markdown']);
                const enableDebugLogs = this.getNodeParameter('enableDebugLogs', i, false);
                const trackChanges = this.getNodeParameter('trackChanges', i, false);
                const enableLlmExtraction = this.getNodeParameter('enableLlmExtraction', i, false);
                // Optional path parameters
                const includePath = this.getNodeParameter('includePath', i, '');
                const excludePath = this.getNodeParameter('excludePath', i, '');
                const maxDepth = this.getNodeParameter('maxDepth', i, 5);
                // Prepare crawl parameters
                const crawlParams = {
                    limit,
                    maxDepth,
                };
                // Add output formats
                crawlParams.scrapeOptions = {
                    formats: outputFormats,
                };
                // Add path filters if provided
                if (includePath) {
                    // Convert to array if it's not already
                    crawlParams.includePaths = includePath.includes(',')
                        ? includePath.split(',').map(path => path.trim())
                        : [includePath.trim()];
                }
                if (excludePath) {
                    // Convert to array if it's not already
                    crawlParams.excludePaths = excludePath.includes(',')
                        ? excludePath.split(',').map(path => path.trim())
                        : [excludePath.trim()];
                }
                // Add change tracking if enabled
                if (trackChanges) {
                    const changeTrackingModes = this.getNodeParameter('changeTrackingMode', i, ['git-diff']);
                    if (!crawlParams.scrapeOptions.formats.includes('changeTracking')) {
                        crawlParams.scrapeOptions.formats.push('changeTracking');
                    }
                    crawlParams.scrapeOptions.changeTrackingOptions = {
                        modes: changeTrackingModes,
                    };
                    // Add JSON schema for change tracking if JSON mode is selected
                    if (changeTrackingModes.includes('json')) {
                        const changeTrackingSchema = JSON.parse(this.getNodeParameter('changeTrackingSchema', i, '{}'));
                        crawlParams.scrapeOptions.changeTrackingOptions.schema = changeTrackingSchema;
                    }
                }
                // Add LLM extraction if enabled
                if (enableLlmExtraction) {
                    const extractionPrompt = this.getNodeParameter('extractionPrompt', i);
                    const schemaDefinitionType = this.getNodeParameter('schemaDefinitionType', i);
                    // Add 'json' to the formats if not already included
                    if (!crawlParams.scrapeOptions.formats.includes('json')) {
                        crawlParams.scrapeOptions.formats.push('json');
                    }
                    crawlParams.scrapeOptions.jsonOptions = {
                        prompt: extractionPrompt,
                    };
                    // Add schema for extraction
                    let schema;
                    if (schemaDefinitionType === 'example') {
                        const jsonExample = JSON.parse(this.getNodeParameter('jsonExample', i));
                        schema = generateSchemaFromExample(jsonExample);
                    }
                    else {
                        // Manual schema definition
                        schema = JSON.parse(this.getNodeParameter('schemaDefinition', i));
                    }
                    crawlParams.scrapeOptions.jsonOptions.schema = schema;
                }
                // Log the crawler parameters if debug is enabled
                if (enableDebugLogs) {
                    console.log('URL:', url);
                    console.log('Crawl parameters:', JSON.stringify(crawlParams, null, 2));
                }
                // Handle different operation modes
                if (operationMode === 'sync') {
                    // Synchronous mode - wait for the crawling to complete
                    const results = await firecrawl.crawlUrl(url, crawlParams);
                    // Log the results if debug is enabled
                    if (enableDebugLogs) {
                        console.log('Crawl results:', JSON.stringify(results, null, 2));
                    }
                    if (!results.success && 'error' in results) {
                        returnData.push({
                            json: {
                                success: false,
                                status: 'error',
                                error: results.error,
                                debug: enableDebugLogs
                                    ? {
                                        url,
                                        params: crawlParams,
                                    }
                                    : undefined,
                            },
                        });
                    }
                    else {
                        returnData.push({
                            json: {
                                success: true,
                                status: 'status' in results ? results.status : 'completed',
                                data: 'data' in results ? results.data : undefined,
                                completed: 'completed' in results ? results.completed : undefined,
                                total: 'total' in results ? results.total : undefined,
                                creditsUsed: 'creditsUsed' in results ? results.creditsUsed : undefined,
                                expiresAt: 'expiresAt' in results ? results.expiresAt : undefined,
                                debug: enableDebugLogs
                                    ? {
                                        url,
                                        params: crawlParams,
                                    }
                                    : undefined,
                            },
                        });
                    }
                }
                else {
                    // Asynchronous mode - start the process or check status
                    const jobId = this.getNodeParameter('jobId', i, '');
                    if (jobId && jobId.trim() !== '') {
                        // Check status of an existing job
                        const status = await firecrawl.checkCrawlStatus(jobId);
                        // Log the status if debug is enabled
                        if (enableDebugLogs) {
                            console.log('Crawl status:', JSON.stringify(status, null, 2));
                        }
                        if (!status.success && 'error' in status) {
                            returnData.push({
                                json: {
                                    success: false,
                                    status: 'error',
                                    error: status.error,
                                    jobId,
                                    debug: enableDebugLogs ? { jobId } : undefined,
                                },
                            });
                        }
                        else {
                            returnData.push({
                                json: {
                                    success: true,
                                    status: 'status' in status ? status.status : 'unknown',
                                    jobId,
                                    completed: 'completed' in status ? status.completed : undefined,
                                    total: 'total' in status ? status.total : undefined,
                                    creditsUsed: 'creditsUsed' in status ? status.creditsUsed : undefined,
                                    expiresAt: 'expiresAt' in status ? status.expiresAt : undefined,
                                    data: 'data' in status ? status.data : [],
                                    next: 'next' in status ? status.next : undefined,
                                    debug: enableDebugLogs ? { jobId } : undefined,
                                },
                            });
                        }
                    }
                    else {
                        // Start a new asynchronous job
                        const job = await firecrawl.asyncCrawlUrl(url, crawlParams);
                        // Log the job if debug is enabled
                        if (enableDebugLogs) {
                            console.log('Crawl job:', JSON.stringify(job, null, 2));
                        }
                        if (!job.success && 'error' in job) {
                            returnData.push({
                                json: {
                                    success: false,
                                    status: 'error',
                                    error: job.error,
                                    debug: enableDebugLogs
                                        ? {
                                            url,
                                            params: crawlParams,
                                        }
                                        : undefined,
                                },
                            });
                        }
                        else {
                            returnData.push({
                                json: {
                                    success: true,
                                    status: 'started',
                                    jobId: 'id' in job ? job.id : undefined,
                                    message: 'Crawling started successfully. Use the job ID to check status.',
                                    debug: enableDebugLogs
                                        ? {
                                            url,
                                            params: crawlParams,
                                        }
                                        : undefined,
                                },
                            });
                        }
                    }
                }
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                console.error('Crawler error:', errorMessage);
                if (this.continueOnFail()) {
                    returnData.push({
                        json: {
                            success: false,
                            error: errorMessage,
                        },
                    });
                    continue;
                }
                throw new n8n_workflow_1.NodeOperationError(this.getNode(), error, {
                    itemIndex: i,
                });
            }
        }
        return [returnData];
    },
};
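For reference, the generateSchemaFromExample helper above mirrors a sample JSON value into a JSON Schema: primitives map to their type, arrays take their item type from the first element, and every object key is marked as required. A minimal illustration of that mapping follows; the sample input is invented for this note, and the helper itself is module-internal rather than exported.

// Illustration only: the helper is not exported from crawler.methods.js.
// Given a hypothetical JSON example like this...
const sampleExample = { title: 'Hello', tags: ['news'], views: 42 };

// ...the logic above would derive the following schema, with array items
// typed from the first element and every top-level key listed as required:
const expectedSchema = {
    type: 'object',
    properties: {
        title: { type: 'string' },
        tags: { type: 'array', items: { type: 'string' } },
        views: { type: 'number' },
    },
    required: ['title', 'tags', 'views'],
};
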
package/dist/nodes/Firecrawl/resources/crawler/crawler.properties.js
@@ -0,0 +1,313 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.crawlerProperties = void 0;
// Fields for the Crawler resource
const crawlerFields = [
    {
        displayName: 'URL',
        name: 'url',
        type: 'string',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: '',
        required: true,
        description: 'The starting URL to crawl',
        placeholder: 'https://example.com',
    },
    {
        displayName: 'Limit',
        name: 'limit',
        type: 'number',
        typeOptions: {
            minValue: 1,
        },
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: 50,
        description: 'Max number of results to return',
    },
    {
        displayName: 'Output Formats',
        name: 'outputFormats',
        type: 'multiOptions',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        options: [
            {
                name: 'Full Page Screenshot',
                value: 'screenshot@fullPage',
                description: 'Return a screenshot of the entire page',
            },
            {
                name: 'HTML',
                value: 'html',
                description: 'Return the content in HTML format (with some cleaning)',
            },
            {
                name: 'Links',
                value: 'links',
                description: 'Return a list of links found on the page',
            },
            {
                name: 'Markdown',
                value: 'markdown',
                description: 'Return the content in Markdown format',
            },
            {
                name: 'Raw HTML',
                value: 'rawHtml',
                description: 'Return the raw HTML content with no modifications',
            },
            {
                name: 'Screenshot',
                value: 'screenshot',
                description: 'Return a screenshot of the visible part of the page',
            },
        ],
        default: ['markdown'],
        description: 'The formats in which to return the crawled content',
    },
    {
        displayName: 'Track Changes',
        name: 'trackChanges',
        type: 'boolean',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: false,
        description: 'Whether to track changes between the current crawl and previous ones',
    },
    {
        displayName: 'Change Tracking Mode',
        name: 'changeTrackingMode',
        type: 'multiOptions',
        displayOptions: {
            show: {
                resource: ['crawler'],
                trackChanges: [true],
            },
        },
        options: [
            {
                name: 'Git-Diff',
                value: 'git-diff',
                description: 'Track changes using Git-style diff format',
            },
            {
                name: 'JSON',
                value: 'json',
                description: 'Track changes in structured JSON format',
            },
        ],
        default: ['git-diff'],
        description: 'The modes to use for change tracking',
    },
    {
        displayName: 'JSON Schema for Change Tracking',
        name: 'changeTrackingSchema',
        type: 'json',
        typeOptions: {
            alwaysOpenEditWindow: true,
            rows: 8,
        },
        displayOptions: {
            show: {
                resource: ['crawler'],
                trackChanges: [true],
                changeTrackingMode: ['json'],
            },
        },
        default: '{\n "type": "object",\n "properties": {\n "title": {\n "type": "string",\n "description": "The title of the page"\n },\n "content": {\n "type": "string",\n "description": "The main content of the page"\n }\n }\n}',
        description: 'Schema for JSON-based change tracking',
    },
    {
        displayName: 'Include Path Pattern',
        name: 'includePath',
        type: 'string',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: '',
        description: 'Regex pattern for paths to include (e.g., /blog/.*)',
    },
    {
        displayName: 'Exclude Path Pattern',
        name: 'excludePath',
        type: 'string',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: '',
        description: 'Regex pattern for paths to exclude (e.g., /wp-admin/.*)',
    },
    {
        displayName: 'Maximum Depth',
        name: 'maxDepth',
        type: 'number',
        typeOptions: {
            minValue: 1,
            maxValue: 10,
        },
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: 5,
        description: 'Maximum depth of links to follow (1-10)',
    },
    {
        displayName: 'Operation Mode',
        name: 'operationMode',
        type: 'options',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        options: [
            {
                name: 'Synchronous',
                value: 'sync',
                description: 'Wait for the crawling to complete (suitable for smaller sites)',
            },
            {
                name: 'Asynchronous',
                value: 'async',
                description: 'Start the crawling process and return a job ID (suitable for larger sites)',
            },
        ],
        default: 'sync',
        description: 'Whether to wait for the crawling to complete or just start the process',
    },
    {
        displayName: 'Job ID',
        name: 'jobId',
        type: 'string',
        displayOptions: {
            show: {
                resource: ['crawler'],
                operationMode: ['async'],
            },
        },
        default: '',
        description: 'Job ID for checking the status of an existing crawl (leave empty to start a new job)',
    },
    {
        displayName: 'Enable LLM Extraction',
        name: 'enableLlmExtraction',
        type: 'boolean',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: false,
        description: 'Whether to extract structured data from crawled pages using LLMs',
    },
    {
        displayName: 'Extraction Prompt',
        name: 'extractionPrompt',
        type: 'string',
        displayOptions: {
            show: {
                resource: ['crawler'],
                enableLlmExtraction: [true],
            },
        },
        default: '',
        required: true,
        description: 'The prompt to guide the extraction process',
        placeholder: 'Extract the title and main content from each page',
    },
    {
        displayName: 'Schema Definition Type',
        name: 'schemaDefinitionType',
        type: 'options',
        displayOptions: {
            show: {
                resource: ['crawler'],
                enableLlmExtraction: [true],
            },
        },
        options: [
            {
                name: 'Generate From JSON Example',
                value: 'example',
                description: 'Generate schema from a JSON example',
            },
            {
                name: 'Define Below',
                value: 'manual',
                description: 'Define schema manually in JSON Schema format',
            },
        ],
        default: 'manual',
        description: 'How to define the schema for extraction',
    },
    {
        displayName: 'JSON Example',
        name: 'jsonExample',
        type: 'json',
        typeOptions: {
            alwaysOpenEditWindow: true,
            rows: 8,
        },
        displayOptions: {
            show: {
                resource: ['crawler'],
                enableLlmExtraction: [true],
                schemaDefinitionType: ['example'],
            },
        },
        default: '{\n "title": "Example Page Title",\n "content": "This is the main content of the page"\n}',
        description: 'A JSON example that represents the data structure you want to extract',
    },
    {
        displayName: 'Schema Definition',
        name: 'schemaDefinition',
        type: 'json',
        typeOptions: {
            alwaysOpenEditWindow: true,
            rows: 10,
        },
        displayOptions: {
            show: {
                resource: ['crawler'],
                enableLlmExtraction: [true],
                schemaDefinitionType: ['manual'],
            },
        },
        default: '{\n "type": "object",\n "properties": {\n "title": {\n "type": "string",\n "description": "The title of the page"\n },\n "content": {\n "type": "string",\n "description": "The main content of the page"\n }\n },\n "required": ["title", "content"]\n}',
        description: 'The schema definition in standard JSON Schema format',
    },
    {
        displayName: 'Enable Debug Logs',
        name: 'enableDebugLogs',
        type: 'boolean',
        displayOptions: {
            show: {
                resource: ['crawler'],
            },
        },
        default: false,
        description: 'Whether to enable debug logs in the output',
    },
];
// Export all properties for the Crawler resource
exports.crawlerProperties = [...crawlerFields];
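These crawler properties and methods are presumably wired together by FireCrawlScraper.node.js (listed above, not included in this hunk). The sketch below assumes the common n8n community-node pattern of spreading per-resource property arrays into the node description and dispatching execute() by the selected resource; the displayName/name values and the dispatch shape are illustrative, not taken from the package.

// Minimal sketch, assuming the usual n8n community-node layout; the real
// wiring lives in dist/nodes/Firecrawl/FireCrawlScraper.node.js.
const { crawlerProperties } = require('./resources/crawler/crawler.properties');
const { crawlerMethods } = require('./resources/crawler/crawler.methods');

const description = {
    displayName: 'FireCrawl Scraper', // illustrative values, not confirmed
    name: 'fireCrawlScraper',
    properties: [
        // A 'resource' selector normally comes first, so that the
        // displayOptions.show.resource filters above take effect.
        ...crawlerProperties,
    ],
};

// execute() would then dispatch to the matching resource handler, e.g.:
//   if (resource === 'crawler') return crawlerMethods.execute.call(this);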