@apteva/integrations 0.3.39 → 0.3.42
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/apps/firecrawl.json +363 -105
- package/src/apps/gigs-marketplace.json +95 -152
- package/src/apps/omnikit-cms.json +18 -0
- package/src/apps/omnikit-code-ops.json +18 -0
- package/src/apps/omnikit-messaging.json +18 -0
- package/src/apps/omnikit-storage.json +29 -199
- package/src/apps/pushover.json +92 -15
- package/src/apps/sendgrid.json +226 -33
- package/src/apps/socialcast.json +13 -1
- package/src/apps/stripe.json +609 -151
- package/src/apps/pushover-notifications.json +0 -119
- package/src/apps/sendgrid-email.json +0 -275
- package/src/apps/stripe-payments.json +0 -911
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@apteva/integrations",
|
|
3
|
-
"version": "0.3.39",
|
|
3
|
+
"version": "0.3.42",
|
|
4
4
|
"description": "Local integrations, connections, and webhooks for Apteva. Self-contained app templates, OAuth engine, and trigger provider.",
|
|
5
5
|
"author": "Apteva <hello@apteva.com>",
|
|
6
6
|
"license": "Elastic-2.0",
|
package/src/apps/firecrawl.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"slug": "firecrawl",
|
|
3
3
|
"name": "Firecrawl",
|
|
4
|
-
"description": "Web scraping and crawling API
|
|
4
|
+
"description": "Web scraping and crawling API. Turns any website into LLM-ready markdown, HTML, screenshots, links, or structured JSON. Handles JavaScript rendering, anti-bot bypass, sitemap discovery, web search, and natural-language structured extraction. v2 API.",
|
|
5
5
|
"logo": "https://www.google.com/s2/favicons?domain=firecrawl.dev&sz=128",
|
|
6
6
|
"categories": [
|
|
7
7
|
"scraping",
|
|
@@ -10,28 +10,27 @@
|
|
|
10
10
|
"markdown",
|
|
11
11
|
"ai",
|
|
12
12
|
"llm",
|
|
13
|
-
"search"
|
|
13
|
+
"search",
|
|
14
|
+
"extraction"
|
|
14
15
|
],
|
|
15
|
-
"base_url": "",
|
|
16
|
+
"base_url": "https://api.firecrawl.dev/v2",
|
|
16
17
|
"auth": {
|
|
17
|
-
"types": [
|
|
18
|
-
"bearer"
|
|
19
|
-
],
|
|
18
|
+
"types": ["bearer"],
|
|
20
19
|
"headers": {
|
|
21
20
|
"Authorization": "Bearer {{token}}"
|
|
22
21
|
},
|
|
23
22
|
"credential_fields": [
|
|
24
23
|
{
|
|
25
24
|
"name": "token",
|
|
26
|
-
"label": "
|
|
25
|
+
"label": "API key (starts with fc-)"
|
|
27
26
|
}
|
|
28
27
|
]
|
|
29
28
|
},
|
|
30
29
|
"tools": [
|
|
31
30
|
{
|
|
32
31
|
"name": "scrape",
|
|
33
|
-
"description": "Scrape a single URL and
|
|
34
|
-
"method": "
|
|
32
|
+
"description": "Scrape a single URL and return its content in LLM-ready formats (markdown, HTML, links, screenshots, or structured JSON). Handles JavaScript rendering and anti-bot bypass. Use for one-off page extraction. For multiple URLs, use crawl or batch_scrape.",
|
|
33
|
+
"method": "POST",
|
|
35
34
|
"path": "/scrape",
|
|
36
35
|
"input_schema": {
|
|
37
36
|
"type": "object",
|
|
@@ -44,74 +43,99 @@
|
|
|
44
43
|
"type": "array",
|
|
45
44
|
"items": {
|
|
46
45
|
"type": "string",
|
|
47
|
-
"enum": [
|
|
48
|
-
"markdown",
|
|
49
|
-
"html",
|
|
50
|
-
"rawHtml",
|
|
51
|
-
"links",
|
|
52
|
-
"screenshot",
|
|
53
|
-
"screenshot@fullPage"
|
|
54
|
-
]
|
|
46
|
+
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "json"]
|
|
55
47
|
},
|
|
56
|
-
"default": [
|
|
57
|
-
|
|
58
|
-
],
|
|
59
|
-
"description": "Output formats to return"
|
|
48
|
+
"default": ["markdown"],
|
|
49
|
+
"description": "Output formats to return. Default is markdown only."
|
|
60
50
|
},
|
|
61
51
|
"onlyMainContent": {
|
|
62
52
|
"type": "boolean",
|
|
63
53
|
"default": true,
|
|
64
|
-
"description": "
|
|
54
|
+
"description": "Strip headers, navs, footers, and other boilerplate. Default true."
|
|
65
55
|
},
|
|
66
56
|
"includeTags": {
|
|
67
57
|
"type": "array",
|
|
68
|
-
"items": {
|
|
69
|
-
|
|
70
|
-
},
|
|
71
|
-
"description": "HTML tags to include (e.g., ['article', 'main'])"
|
|
58
|
+
"items": { "type": "string" },
|
|
59
|
+
"description": "HTML tags or CSS selectors to include (e.g. ['article', 'main', '.post-body'])"
|
|
72
60
|
},
|
|
73
61
|
"excludeTags": {
|
|
74
62
|
"type": "array",
|
|
75
|
-
"items": {
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
63
|
+
"items": { "type": "string" },
|
|
64
|
+
"description": "HTML tags or CSS selectors to exclude (e.g. ['nav', 'footer', '.ads'])"
|
|
65
|
+
},
|
|
66
|
+
"headers": {
|
|
67
|
+
"type": "object",
|
|
68
|
+
"description": "Custom request headers (cookies, user-agent, etc.)"
|
|
79
69
|
},
|
|
80
70
|
"waitFor": {
|
|
81
71
|
"type": "integer",
|
|
82
72
|
"default": 0,
|
|
83
|
-
"description": "Milliseconds to wait for JavaScript to
|
|
73
|
+
"description": "Milliseconds to wait for JavaScript to finish rendering before scraping"
|
|
84
74
|
},
|
|
85
75
|
"mobile": {
|
|
86
76
|
"type": "boolean",
|
|
87
77
|
"default": false,
|
|
88
|
-
"description": "
|
|
78
|
+
"description": "Emulate a mobile device"
|
|
79
|
+
},
|
|
80
|
+
"skipTlsVerification": {
|
|
81
|
+
"type": "boolean",
|
|
82
|
+
"default": true,
|
|
83
|
+
"description": "Skip TLS certificate verification"
|
|
89
84
|
},
|
|
90
85
|
"timeout": {
|
|
91
86
|
"type": "integer",
|
|
92
|
-
"default":
|
|
93
|
-
"
|
|
87
|
+
"default": 60000,
|
|
88
|
+
"minimum": 1000,
|
|
89
|
+
"maximum": 300000,
|
|
90
|
+
"description": "Request timeout in milliseconds (1000–300000)"
|
|
94
91
|
},
|
|
95
92
|
"removeBase64Images": {
|
|
96
93
|
"type": "boolean",
|
|
97
94
|
"default": true,
|
|
98
|
-
"description": "
|
|
95
|
+
"description": "Strip base64-encoded images from the markdown output"
|
|
99
96
|
},
|
|
100
97
|
"blockAds": {
|
|
101
98
|
"type": "boolean",
|
|
102
99
|
"default": true,
|
|
103
|
-
"description": "Block ads and
|
|
100
|
+
"description": "Block ads, trackers, and cookie-consent popups"
|
|
101
|
+
},
|
|
102
|
+
"proxy": {
|
|
103
|
+
"type": "string",
|
|
104
|
+
"enum": ["basic", "enhanced", "auto"],
|
|
105
|
+
"default": "auto",
|
|
106
|
+
"description": "Proxy type. 'enhanced' is best for anti-bot sites; 'auto' picks per-request."
|
|
107
|
+
},
|
|
108
|
+
"actions": {
|
|
109
|
+
"type": "array",
|
|
110
|
+
"items": { "type": "object" },
|
|
111
|
+
"description": "Pre-scrape browser actions to perform (click, type, scroll, wait, etc.). See Firecrawl docs for the action schema."
|
|
112
|
+
},
|
|
113
|
+
"location": {
|
|
114
|
+
"type": "object",
|
|
115
|
+
"description": "Geographic location settings (country, languages) for proxy and emulation"
|
|
116
|
+
},
|
|
117
|
+
"maxAge": {
|
|
118
|
+
"type": "integer",
|
|
119
|
+
"default": 172800000,
|
|
120
|
+
"description": "Use cached content if it's no older than N milliseconds. Default 48h."
|
|
121
|
+
},
|
|
122
|
+
"storeInCache": {
|
|
123
|
+
"type": "boolean",
|
|
124
|
+
"default": true,
|
|
125
|
+
"description": "Whether to store this scrape in Firecrawl's cache"
|
|
126
|
+
},
|
|
127
|
+
"jsonOptions": {
|
|
128
|
+
"type": "object",
|
|
129
|
+
"description": "When 'json' is in formats, structured-extraction options: { schema, prompt, systemPrompt }"
|
|
104
130
|
}
|
|
105
131
|
},
|
|
106
|
-
"required": [
|
|
107
|
-
"url"
|
|
108
|
-
]
|
|
132
|
+
"required": ["url"]
|
|
109
133
|
}
|
|
110
134
|
},
|
|
111
135
|
{
|
|
112
136
|
"name": "crawl",
|
|
113
|
-
"description": "Start
|
|
114
|
-
"method": "
|
|
137
|
+
"description": "Start an asynchronous crawl of an entire website. Returns a job id immediately — poll firecrawl_crawl_status with that id to retrieve scraped pages as they complete. Use for multi-page extraction. For one URL use scrape; for a fixed URL list use batch_scrape.",
|
|
138
|
+
"method": "POST",
|
|
115
139
|
"path": "/crawl",
|
|
116
140
|
"input_schema": {
|
|
117
141
|
"type": "object",
|
|
@@ -120,81 +144,116 @@
|
|
|
120
144
|
"type": "string",
|
|
121
145
|
"description": "The starting URL to crawl"
|
|
122
146
|
},
|
|
123
|
-
"
|
|
124
|
-
"type": "
|
|
125
|
-
"
|
|
126
|
-
"description": "Maximum link depth to crawl"
|
|
127
|
-
},
|
|
128
|
-
"maxDiscoveryDepth": {
|
|
129
|
-
"type": "integer",
|
|
130
|
-
"description": "Maximum depth for URL discovery (separate from content depth)"
|
|
147
|
+
"prompt": {
|
|
148
|
+
"type": "string",
|
|
149
|
+
"description": "Natural-language description of what to crawl. Firecrawl uses this to auto-derive include/exclude paths and depth."
|
|
131
150
|
},
|
|
132
151
|
"limit": {
|
|
133
152
|
"type": "integer",
|
|
134
|
-
"default":
|
|
153
|
+
"default": 10000,
|
|
135
154
|
"description": "Maximum number of pages to crawl"
|
|
136
155
|
},
|
|
156
|
+
"maxDiscoveryDepth": {
|
|
157
|
+
"type": "integer",
|
|
158
|
+
"description": "Maximum link-discovery depth (independent from page-content depth)"
|
|
159
|
+
},
|
|
137
160
|
"includePaths": {
|
|
138
161
|
"type": "array",
|
|
139
|
-
"items": {
|
|
140
|
-
|
|
141
|
-
},
|
|
142
|
-
"description": "URL patterns to include (e.g., ['/blog/*', '/docs/*'])"
|
|
162
|
+
"items": { "type": "string" },
|
|
163
|
+
"description": "Regex patterns of URL pathnames to include (e.g. ['/blog/.*', '/docs/.*'])"
|
|
143
164
|
},
|
|
144
165
|
"excludePaths": {
|
|
145
166
|
"type": "array",
|
|
146
|
-
"items": {
|
|
147
|
-
|
|
148
|
-
},
|
|
149
|
-
"description": "URL patterns to exclude (e.g., ['/admin/*'])"
|
|
167
|
+
"items": { "type": "string" },
|
|
168
|
+
"description": "Regex patterns of URL pathnames to exclude (e.g. ['/admin/.*', '/login.*'])"
|
|
150
169
|
},
|
|
151
|
-
"
|
|
170
|
+
"regexOnFullURL": {
|
|
171
|
+
"type": "boolean",
|
|
172
|
+
"default": false,
|
|
173
|
+
"description": "Match include/exclude regexes against the full URL instead of just the pathname"
|
|
174
|
+
},
|
|
175
|
+
"ignoreQueryParameters": {
|
|
152
176
|
"type": "boolean",
|
|
153
177
|
"default": false,
|
|
154
|
-
"description": "
|
|
178
|
+
"description": "Treat URLs that differ only by query string as the same page"
|
|
179
|
+
},
|
|
180
|
+
"sitemap": {
|
|
181
|
+
"type": "string",
|
|
182
|
+
"enum": ["skip", "include", "only"],
|
|
183
|
+
"default": "include",
|
|
184
|
+
"description": "Sitemap handling: 'skip' ignores it, 'include' uses it as a hint, 'only' restricts crawl to sitemap URLs"
|
|
155
185
|
},
|
|
156
|
-
"
|
|
186
|
+
"crawlEntireDomain": {
|
|
157
187
|
"type": "boolean",
|
|
158
188
|
"default": false,
|
|
159
|
-
"description": "Allow
|
|
189
|
+
"description": "Allow following sibling and parent URLs, not just children"
|
|
190
|
+
},
|
|
191
|
+
"allowSubdomains": {
|
|
192
|
+
"type": "boolean",
|
|
193
|
+
"default": false,
|
|
194
|
+
"description": "Follow links into subdomains of the starting URL"
|
|
160
195
|
},
|
|
161
196
|
"allowExternalLinks": {
|
|
162
197
|
"type": "boolean",
|
|
163
198
|
"default": false,
|
|
164
|
-
"description": "
|
|
199
|
+
"description": "Follow links to other domains"
|
|
200
|
+
},
|
|
201
|
+
"delay": {
|
|
202
|
+
"type": "number",
|
|
203
|
+
"description": "Seconds to wait between scrapes — use this to respect rate limits"
|
|
204
|
+
},
|
|
205
|
+
"maxConcurrency": {
|
|
206
|
+
"type": "integer",
|
|
207
|
+
"description": "Maximum number of pages scraped in parallel"
|
|
165
208
|
},
|
|
166
209
|
"webhook": {
|
|
167
|
-
"type": "
|
|
168
|
-
"description": "Webhook
|
|
210
|
+
"type": "object",
|
|
211
|
+
"description": "Webhook config for crawl events: { url, headers, metadata }"
|
|
212
|
+
},
|
|
213
|
+
"scrapeOptions": {
|
|
214
|
+
"type": "object",
|
|
215
|
+
"description": "Per-page scrape options applied to every crawled URL — same shape as the scrape tool's body (formats, onlyMainContent, headers, waitFor, etc.)"
|
|
169
216
|
}
|
|
170
217
|
},
|
|
171
|
-
"required": [
|
|
172
|
-
"url"
|
|
173
|
-
]
|
|
218
|
+
"required": ["url"]
|
|
174
219
|
}
|
|
175
220
|
},
|
|
176
221
|
{
|
|
177
222
|
"name": "crawl_status",
|
|
178
|
-
"description": "
|
|
223
|
+
"description": "Poll a running crawl job by id. Returns counts (total/completed), status (scraping/completed/failed), credit usage, and any pages scraped so far. Call repeatedly until status='completed'.",
|
|
179
224
|
"method": "GET",
|
|
180
|
-
"path": "/crawl
|
|
225
|
+
"path": "/crawl/{id}",
|
|
226
|
+
"input_schema": {
|
|
227
|
+
"type": "object",
|
|
228
|
+
"properties": {
|
|
229
|
+
"id": {
|
|
230
|
+
"type": "string",
|
|
231
|
+
"description": "The crawl job id returned by firecrawl_crawl"
|
|
232
|
+
}
|
|
233
|
+
},
|
|
234
|
+
"required": ["id"]
|
|
235
|
+
}
|
|
236
|
+
},
|
|
237
|
+
{
|
|
238
|
+
"name": "crawl_cancel",
|
|
239
|
+
"description": "Cancel a running crawl job by id. Useful when a crawl is taking longer than expected or hitting unexpected pages.",
|
|
240
|
+
"method": "DELETE",
|
|
241
|
+
"path": "/crawl/{id}",
|
|
181
242
|
"input_schema": {
|
|
182
243
|
"type": "object",
|
|
183
244
|
"properties": {
|
|
184
|
-
"jobId": {
|
|
245
|
+
"id": {
|
|
185
246
|
"type": "string",
|
|
186
|
-
"description": "The crawl job
|
|
247
|
+
"description": "The crawl job id returned by firecrawl_crawl"
|
|
187
248
|
}
|
|
188
249
|
},
|
|
189
|
-
"required": [
|
|
190
|
-
"jobId"
|
|
191
|
-
]
|
|
250
|
+
"required": ["id"]
|
|
192
251
|
}
|
|
193
252
|
},
|
|
194
253
|
{
|
|
195
254
|
"name": "map",
|
|
196
|
-
"description": "
|
|
197
|
-
"method": "
|
|
255
|
+
"description": "Discover all URLs on a website without scraping their content. Fast — uses sitemap.xml + light crawling. Use this to plan a targeted scrape, audit site structure, or filter by keyword via 'search'. 1 credit per website.",
|
|
256
|
+
"method": "POST",
|
|
198
257
|
"path": "/map",
|
|
199
258
|
"input_schema": {
|
|
200
259
|
"type": "object",
|
|
@@ -205,73 +264,272 @@
|
|
|
205
264
|
},
|
|
206
265
|
"search": {
|
|
207
266
|
"type": "string",
|
|
208
|
-
"description": "Filter URLs
|
|
267
|
+
"description": "Filter/order returned URLs by relevance to this search query (e.g. 'pricing', 'blog')"
|
|
209
268
|
},
|
|
210
|
-
"
|
|
269
|
+
"sitemap": {
|
|
270
|
+
"type": "string",
|
|
271
|
+
"enum": ["skip", "include", "only"],
|
|
272
|
+
"default": "include",
|
|
273
|
+
"description": "Sitemap handling: 'skip' bypasses it, 'include' uses it + crawls, 'only' uses sitemap exclusively"
|
|
274
|
+
},
|
|
275
|
+
"includeSubdomains": {
|
|
211
276
|
"type": "boolean",
|
|
212
|
-
"default":
|
|
213
|
-
"description": "
|
|
277
|
+
"default": true,
|
|
278
|
+
"description": "Include URLs from subdomains"
|
|
214
279
|
},
|
|
215
|
-
"
|
|
280
|
+
"ignoreQueryParameters": {
|
|
216
281
|
"type": "boolean",
|
|
217
|
-
"default":
|
|
218
|
-
"description": "
|
|
282
|
+
"default": true,
|
|
283
|
+
"description": "Drop query parameters from returned URLs"
|
|
219
284
|
},
|
|
220
|
-
"
|
|
285
|
+
"ignoreCache": {
|
|
221
286
|
"type": "boolean",
|
|
222
287
|
"default": false,
|
|
223
|
-
"description": "
|
|
288
|
+
"description": "Bypass Firecrawl's sitemap cache and re-fetch fresh URLs"
|
|
224
289
|
},
|
|
225
290
|
"limit": {
|
|
226
291
|
"type": "integer",
|
|
227
292
|
"default": 5000,
|
|
228
|
-
"
|
|
293
|
+
"maximum": 100000,
|
|
294
|
+
"description": "Maximum URLs to return"
|
|
295
|
+
},
|
|
296
|
+
"timeout": {
|
|
297
|
+
"type": "integer",
|
|
298
|
+
"description": "Request timeout in milliseconds"
|
|
299
|
+
},
|
|
300
|
+
"location": {
|
|
301
|
+
"type": "object",
|
|
302
|
+
"description": "Geographic location settings (country, languages) for proxy and emulation"
|
|
229
303
|
}
|
|
230
304
|
},
|
|
231
|
-
"required": [
|
|
232
|
-
"url"
|
|
233
|
-
]
|
|
305
|
+
"required": ["url"]
|
|
234
306
|
}
|
|
235
307
|
},
|
|
236
308
|
{
|
|
237
309
|
"name": "search",
|
|
238
|
-
"description": "Search the web and scrape
|
|
239
|
-
"method": "
|
|
310
|
+
"description": "Search the web and optionally scrape result pages in one call. Perfect for research, competitive analysis, lead generation, and SEO. Set scrapeOptions to also receive the full content of each result.",
|
|
311
|
+
"method": "POST",
|
|
240
312
|
"path": "/search",
|
|
241
313
|
"input_schema": {
|
|
242
314
|
"type": "object",
|
|
243
315
|
"properties": {
|
|
244
316
|
"query": {
|
|
245
317
|
"type": "string",
|
|
246
|
-
"description": "
|
|
318
|
+
"description": "Search query (max 500 characters)"
|
|
247
319
|
},
|
|
248
320
|
"limit": {
|
|
249
321
|
"type": "integer",
|
|
250
322
|
"default": 5,
|
|
251
|
-
"
|
|
323
|
+
"minimum": 1,
|
|
324
|
+
"maximum": 100,
|
|
325
|
+
"description": "Number of results to return (1 credit per result if scrapeOptions is set)"
|
|
326
|
+
},
|
|
327
|
+
"sources": {
|
|
328
|
+
"type": "array",
|
|
329
|
+
"items": {
|
|
330
|
+
"type": "object",
|
|
331
|
+
"properties": {
|
|
332
|
+
"type": {
|
|
333
|
+
"type": "string",
|
|
334
|
+
"enum": ["web", "images", "news"]
|
|
335
|
+
}
|
|
336
|
+
}
|
|
337
|
+
},
|
|
338
|
+
"default": [{ "type": "web" }],
|
|
339
|
+
"description": "Search sources to query. Default is web."
|
|
252
340
|
},
|
|
253
|
-
"
|
|
341
|
+
"categories": {
|
|
342
|
+
"type": "array",
|
|
343
|
+
"items": {
|
|
344
|
+
"type": "string",
|
|
345
|
+
"enum": ["github", "research", "pdf"]
|
|
346
|
+
},
|
|
347
|
+
"description": "Filter results by category"
|
|
348
|
+
},
|
|
349
|
+
"tbs": {
|
|
254
350
|
"type": "string",
|
|
255
|
-
"
|
|
256
|
-
"description": "Language code (e.g., 'en', 'es', 'fr')"
|
|
351
|
+
"description": "Time-based search filter (Google syntax: 'qdr:d' = past day, 'qdr:w' = past week, 'qdr:m' = past month, 'qdr:y' = past year)"
|
|
257
352
|
},
|
|
258
353
|
"country": {
|
|
259
354
|
"type": "string",
|
|
260
|
-
"default": "
|
|
261
|
-
"description": "
|
|
355
|
+
"default": "US",
|
|
356
|
+
"description": "ISO country code for localized results (e.g. 'US', 'GB', 'DE')"
|
|
262
357
|
},
|
|
263
358
|
"location": {
|
|
264
359
|
"type": "string",
|
|
265
|
-
"description": "Specific location for local search results"
|
|
360
|
+
"description": "Specific location string for local search results (e.g. 'San Francisco, California, United States')"
|
|
266
361
|
},
|
|
267
|
-
"
|
|
362
|
+
"timeout": {
|
|
363
|
+
"type": "integer",
|
|
364
|
+
"default": 60000,
|
|
365
|
+
"description": "Request timeout in milliseconds"
|
|
366
|
+
},
|
|
367
|
+
"ignoreInvalidURLs": {
|
|
368
|
+
"type": "boolean",
|
|
369
|
+
"default": false,
|
|
370
|
+
"description": "Skip results with invalid URLs instead of erroring"
|
|
371
|
+
},
|
|
372
|
+
"scrapeOptions": {
|
|
373
|
+
"type": "object",
|
|
374
|
+
"description": "When set, scrape each search result and return its content. Same shape as the scrape tool's body."
|
|
375
|
+
}
|
|
376
|
+
},
|
|
377
|
+
"required": ["query"]
|
|
378
|
+
}
|
|
379
|
+
},
|
|
380
|
+
{
|
|
381
|
+
"name": "extract",
|
|
382
|
+
"description": "Extract structured JSON from one or more URLs using natural language and/or a JSON schema. Returns a job id — poll firecrawl_extract_status to retrieve results. Use this when you want typed data instead of raw markdown.",
|
|
383
|
+
"method": "POST",
|
|
384
|
+
"path": "/extract",
|
|
385
|
+
"input_schema": {
|
|
386
|
+
"type": "object",
|
|
387
|
+
"properties": {
|
|
388
|
+
"urls": {
|
|
389
|
+
"type": "array",
|
|
390
|
+
"items": { "type": "string" },
|
|
391
|
+
"description": "URLs to extract from. Glob patterns supported (e.g. 'https://example.com/blog/*')."
|
|
392
|
+
},
|
|
393
|
+
"prompt": {
|
|
394
|
+
"type": "string",
|
|
395
|
+
"description": "Natural-language description of what to extract. Used alongside 'schema' to guide extraction."
|
|
396
|
+
},
|
|
397
|
+
"schema": {
|
|
398
|
+
"type": "object",
|
|
399
|
+
"description": "JSON Schema describing the structure of the desired output"
|
|
400
|
+
},
|
|
401
|
+
"enableWebSearch": {
|
|
402
|
+
"type": "boolean",
|
|
403
|
+
"default": false,
|
|
404
|
+
"description": "Allow Firecrawl to perform supplementary web searches if the target pages don't contain enough info"
|
|
405
|
+
},
|
|
406
|
+
"ignoreSitemap": {
|
|
407
|
+
"type": "boolean",
|
|
408
|
+
"default": false,
|
|
409
|
+
"description": "Bypass sitemap.xml when scanning the target sites"
|
|
410
|
+
},
|
|
411
|
+
"includeSubdomains": {
|
|
412
|
+
"type": "boolean",
|
|
413
|
+
"default": true,
|
|
414
|
+
"description": "Extend extraction to subdomains of the provided URLs"
|
|
415
|
+
},
|
|
416
|
+
"showSources": {
|
|
417
|
+
"type": "boolean",
|
|
418
|
+
"default": false,
|
|
419
|
+
"description": "Include the source URLs that contributed to each extracted field in the response"
|
|
420
|
+
},
|
|
421
|
+
"ignoreInvalidURLs": {
|
|
422
|
+
"type": "boolean",
|
|
423
|
+
"default": true,
|
|
424
|
+
"description": "Skip invalid URLs and continue with the rest"
|
|
425
|
+
},
|
|
426
|
+
"scrapeOptions": {
|
|
427
|
+
"type": "object",
|
|
428
|
+
"description": "Per-page scrape options applied while extracting"
|
|
429
|
+
}
|
|
430
|
+
},
|
|
431
|
+
"required": ["urls"]
|
|
432
|
+
}
|
|
433
|
+
},
|
|
434
|
+
{
|
|
435
|
+
"name": "extract_status",
|
|
436
|
+
"description": "Poll an extract job by id. Returns the structured data once status='completed'.",
|
|
437
|
+
"method": "GET",
|
|
438
|
+
"path": "/extract/{id}",
|
|
439
|
+
"input_schema": {
|
|
440
|
+
"type": "object",
|
|
441
|
+
"properties": {
|
|
442
|
+
"id": {
|
|
443
|
+
"type": "string",
|
|
444
|
+
"description": "The extract job id returned by firecrawl_extract"
|
|
445
|
+
}
|
|
446
|
+
},
|
|
447
|
+
"required": ["id"]
|
|
448
|
+
}
|
|
449
|
+
},
|
|
450
|
+
{
|
|
451
|
+
"name": "batch_scrape",
|
|
452
|
+
"description": "Scrape a fixed list of URLs in one job. Returns a job id — poll firecrawl_batch_scrape_status to retrieve results. Use this when you already have the list of URLs (vs crawl, which discovers them).",
|
|
453
|
+
"method": "POST",
|
|
454
|
+
"path": "/batch/scrape",
|
|
455
|
+
"input_schema": {
|
|
456
|
+
"type": "object",
|
|
457
|
+
"properties": {
|
|
458
|
+
"urls": {
|
|
459
|
+
"type": "array",
|
|
460
|
+
"items": { "type": "string" },
|
|
461
|
+
"description": "List of URLs to scrape"
|
|
462
|
+
},
|
|
463
|
+
"formats": {
|
|
464
|
+
"type": "array",
|
|
465
|
+
"items": {
|
|
466
|
+
"type": "string",
|
|
467
|
+
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "json"]
|
|
468
|
+
},
|
|
469
|
+
"default": ["markdown"],
|
|
470
|
+
"description": "Output formats applied to every URL"
|
|
471
|
+
},
|
|
472
|
+
"onlyMainContent": {
|
|
473
|
+
"type": "boolean",
|
|
474
|
+
"default": true,
|
|
475
|
+
"description": "Strip headers, navs, footers, and other boilerplate"
|
|
476
|
+
},
|
|
477
|
+
"includeTags": {
|
|
478
|
+
"type": "array",
|
|
479
|
+
"items": { "type": "string" }
|
|
480
|
+
},
|
|
481
|
+
"excludeTags": {
|
|
482
|
+
"type": "array",
|
|
483
|
+
"items": { "type": "string" }
|
|
484
|
+
},
|
|
485
|
+
"headers": { "type": "object" },
|
|
486
|
+
"waitFor": { "type": "integer", "default": 0 },
|
|
487
|
+
"timeout": { "type": "integer", "default": 60000 },
|
|
488
|
+
"blockAds": { "type": "boolean", "default": true },
|
|
489
|
+
"removeBase64Images": { "type": "boolean", "default": true },
|
|
490
|
+
"ignoreInvalidURLs": {
|
|
491
|
+
"type": "boolean",
|
|
492
|
+
"default": false,
|
|
493
|
+
"description": "Skip invalid URLs instead of failing the whole batch"
|
|
494
|
+
},
|
|
495
|
+
"webhook": {
|
|
496
|
+
"type": "object",
|
|
497
|
+
"description": "Webhook config for batch events: { url, headers, metadata }"
|
|
498
|
+
}
|
|
499
|
+
},
|
|
500
|
+
"required": ["urls"]
|
|
501
|
+
}
|
|
502
|
+
},
|
|
503
|
+
{
|
|
504
|
+
"name": "batch_scrape_status",
|
|
505
|
+
"description": "Poll a batch scrape job by id. Returns counts and any scraped pages so far.",
|
|
506
|
+
"method": "GET",
|
|
507
|
+
"path": "/batch/scrape/{id}",
|
|
508
|
+
"input_schema": {
|
|
509
|
+
"type": "object",
|
|
510
|
+
"properties": {
|
|
511
|
+
"id": {
|
|
512
|
+
"type": "string",
|
|
513
|
+
"description": "The batch scrape job id returned by firecrawl_batch_scrape"
|
|
514
|
+
}
|
|
515
|
+
},
|
|
516
|
+
"required": ["id"]
|
|
517
|
+
}
|
|
518
|
+
},
|
|
519
|
+
{
|
|
520
|
+
"name": "batch_scrape_cancel",
|
|
521
|
+
"description": "Cancel a running batch scrape job.",
|
|
522
|
+
"method": "DELETE",
|
|
523
|
+
"path": "/batch/scrape/{id}",
|
|
524
|
+
"input_schema": {
|
|
525
|
+
"type": "object",
|
|
526
|
+
"properties": {
|
|
527
|
+
"id": {
|
|
268
528
|
"type": "string",
|
|
269
|
-
"description": "
|
|
529
|
+
"description": "The batch scrape job id returned by firecrawl_batch_scrape"
|
|
270
530
|
}
|
|
271
531
|
},
|
|
272
|
-
"required": [
|
|
273
|
-
"query"
|
|
274
|
-
]
|
|
532
|
+
"required": ["id"]
|
|
275
533
|
}
|
|
276
534
|
}
|
|
277
535
|
]
|