firecrawl-aisdk 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +181 -0
- package/dist/index.d.ts +603 -0
- package/dist/index.js +539 -0
- package/package.json +53 -0
package/README.md
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
# Firecrawl AI SDK Tools
|
|
2
|
+
|
|
3
|
+
Firecrawl tools for Vercel AI SDK v5. Web scraping, search, crawling, and data extraction for AI applications.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
npm install @developersdigest/fc-ai-sdk ai @ai-sdk/openai
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
Set environment variables:
|
|
12
|
+
```bash
|
|
13
|
+
FIRECRAWL_API_KEY=fc-your-key
|
|
14
|
+
OPENAI_API_KEY=sk-your-key
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quick Start
|
|
18
|
+
|
|
19
|
+
```typescript
|
|
20
|
+
import { generateText } from 'ai';
|
|
21
|
+
import { openai } from '@ai-sdk/openai';
|
|
22
|
+
import { scrapeTool } from '@developersdigest/fc-ai-sdk';
|
|
23
|
+
|
|
24
|
+
const { text } = await generateText({
|
|
25
|
+
model: openai('gpt-5-mini'),
|
|
26
|
+
prompt: 'Scrape https://firecrawl.dev and summarize what it does',
|
|
27
|
+
tools: { scrape: scrapeTool },
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Available Tools
|
|
32
|
+
|
|
33
|
+
```typescript
|
|
34
|
+
import {
|
|
35
|
+
scrapeTool, // Scrape single URL
|
|
36
|
+
searchTool, // Search the web
|
|
37
|
+
mapTool, // Discover URLs on a site
|
|
38
|
+
crawlTool, // Crawl multiple pages
|
|
39
|
+
batchScrapeTool, // Scrape multiple URLs
|
|
40
|
+
extractTool, // Extract structured data
|
|
41
|
+
pollTool, // Poll async jobs
|
|
42
|
+
statusTool, // Check job status
|
|
43
|
+
cancelTool, // Cancel jobs
|
|
44
|
+
} from '@developersdigest/fc-ai-sdk';
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Examples
|
|
48
|
+
|
|
49
|
+
**Scrape**
|
|
50
|
+
```typescript
|
|
51
|
+
import { generateText } from 'ai';
|
|
52
|
+
import { openai } from '@ai-sdk/openai';
|
|
53
|
+
import { scrapeTool } from '@developersdigest/fc-ai-sdk';
|
|
54
|
+
|
|
55
|
+
const { text } = await generateText({
|
|
56
|
+
model: openai('gpt-5-mini'),
|
|
57
|
+
prompt: 'Scrape https://firecrawl.dev and summarize what it does',
|
|
58
|
+
tools: { scrape: scrapeTool },
|
|
59
|
+
});
|
|
60
|
+
|
|
61
|
+
console.log(text);
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
**Search**
|
|
65
|
+
```typescript
|
|
66
|
+
import { generateText } from 'ai';
|
|
67
|
+
import { openai } from '@ai-sdk/openai';
|
|
68
|
+
import { searchTool } from '@developersdigest/fc-ai-sdk';
|
|
69
|
+
|
|
70
|
+
const { text } = await generateText({
|
|
71
|
+
model: openai('gpt-5-mini'),
|
|
72
|
+
prompt: 'Search for Firecrawl and summarize what you find',
|
|
73
|
+
tools: { search: searchTool },
|
|
74
|
+
});
|
|
75
|
+
|
|
76
|
+
console.log(text);
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
**Map**
|
|
80
|
+
```typescript
|
|
81
|
+
import { generateText } from 'ai';
|
|
82
|
+
import { openai } from '@ai-sdk/openai';
|
|
83
|
+
import { mapTool } from '@developersdigest/fc-ai-sdk';
|
|
84
|
+
|
|
85
|
+
const { text } = await generateText({
|
|
86
|
+
model: openai('gpt-5-mini'),
|
|
87
|
+
prompt: 'Map https://docs.firecrawl.dev and list the main sections',
|
|
88
|
+
tools: { map: mapTool },
|
|
89
|
+
});
|
|
90
|
+
|
|
91
|
+
console.log(text);
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Crawl** (async - requires polling)
|
|
95
|
+
```typescript
|
|
96
|
+
import { generateText } from 'ai';
|
|
97
|
+
import { openai } from '@ai-sdk/openai';
|
|
98
|
+
import { crawlTool, pollTool } from '@developersdigest/fc-ai-sdk';
|
|
99
|
+
|
|
100
|
+
const { text } = await generateText({
|
|
101
|
+
model: openai('gpt-5-mini'),
|
|
102
|
+
prompt: 'Crawl https://docs.firecrawl.dev (limit 3 pages) and summarize',
|
|
103
|
+
tools: { crawl: crawlTool, poll: pollTool },
|
|
104
|
+
});
|
|
105
|
+
|
|
106
|
+
console.log(text);
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
**Batch** (async - requires polling)
|
|
110
|
+
```typescript
|
|
111
|
+
import { generateText } from 'ai';
|
|
112
|
+
import { openai } from '@ai-sdk/openai';
|
|
113
|
+
import { batchScrapeTool, pollTool } from '@developersdigest/fc-ai-sdk';
|
|
114
|
+
|
|
115
|
+
const { text } = await generateText({
|
|
116
|
+
model: openai('gpt-5-mini'),
|
|
117
|
+
prompt: 'Scrape https://firecrawl.dev and https://docs.firecrawl.dev, then compare',
|
|
118
|
+
tools: { batchScrape: batchScrapeTool, poll: pollTool },
|
|
119
|
+
});
|
|
120
|
+
|
|
121
|
+
console.log(text);
|
|
122
|
+
```
|
|
123
|
+
|
|
124
|
+
**Extract** (async - requires polling)
|
|
125
|
+
```typescript
|
|
126
|
+
import { generateText } from 'ai';
|
|
127
|
+
import { openai } from '@ai-sdk/openai';
|
|
128
|
+
import { extractTool, pollTool } from '@developersdigest/fc-ai-sdk';
|
|
129
|
+
|
|
130
|
+
const { text } = await generateText({
|
|
131
|
+
model: openai('gpt-5-mini'),
|
|
132
|
+
prompt: 'Extract the main features from https://firecrawl.dev',
|
|
133
|
+
tools: { extract: extractTool, poll: pollTool },
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
console.log(text);
|
|
137
|
+
```
|
|
138
|
+
|
|
139
|
+
**Search + Scrape**
|
|
140
|
+
```typescript
|
|
141
|
+
import { generateText } from 'ai';
|
|
142
|
+
import { openai } from '@ai-sdk/openai';
|
|
143
|
+
import { searchTool, scrapeTool } from '@developersdigest/fc-ai-sdk';
|
|
144
|
+
|
|
145
|
+
const { text } = await generateText({
|
|
146
|
+
model: openai('gpt-5-mini'),
|
|
147
|
+
prompt: 'Search for Firecrawl, scrape the top result, and explain what it does',
|
|
148
|
+
tools: { search: searchTool, scrape: scrapeTool },
|
|
149
|
+
});
|
|
150
|
+
|
|
151
|
+
console.log(text);
|
|
152
|
+
```
|
|
153
|
+
|
|
154
|
+
**Stream**
|
|
155
|
+
```typescript
|
|
156
|
+
import { streamText } from 'ai';
|
|
157
|
+
import { openai } from '@ai-sdk/openai';
|
|
158
|
+
import { scrapeTool } from '@developersdigest/fc-ai-sdk';
|
|
159
|
+
|
|
160
|
+
const result = streamText({
|
|
161
|
+
model: openai('gpt-5-mini'),
|
|
162
|
+
prompt: 'Scrape https://firecrawl.dev and explain what it does',
|
|
163
|
+
tools: { scrape: scrapeTool },
|
|
164
|
+
});
|
|
165
|
+
|
|
166
|
+
for await (const chunk of result.textStream) {
|
|
167
|
+
process.stdout.write(chunk);
|
|
168
|
+
}
|
|
169
|
+
```
|
|
170
|
+
|
|
171
|
+
## Development
|
|
172
|
+
|
|
173
|
+
Run tests:
|
|
174
|
+
```bash
|
|
175
|
+
pnpm test # Run all tests
|
|
176
|
+
pnpm test:watch # Watch mode
|
|
177
|
+
```
|
|
178
|
+
|
|
179
|
+
## License
|
|
180
|
+
|
|
181
|
+
MIT
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,603 @@
|
|
|
1
|
+
import * as ai from 'ai';
|
|
2
|
+
import * as _mendable_firecrawl_js from '@mendable/firecrawl-js';
|
|
3
|
+
import { z } from 'zod';
|
|
4
|
+
|
|
5
|
+
declare const scrapeTool: ai.Tool<{
|
|
6
|
+
url: string;
|
|
7
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links" | "summary" | "changeTracking" | "branding" | {
|
|
8
|
+
type: "json";
|
|
9
|
+
prompt?: string | undefined;
|
|
10
|
+
schema?: Record<string, any> | undefined;
|
|
11
|
+
} | {
|
|
12
|
+
type: "screenshot";
|
|
13
|
+
fullPage?: boolean | undefined;
|
|
14
|
+
quality?: number | undefined;
|
|
15
|
+
viewport?: {
|
|
16
|
+
width: number;
|
|
17
|
+
height: number;
|
|
18
|
+
} | undefined;
|
|
19
|
+
})[] | undefined;
|
|
20
|
+
parsers?: ("pdf" | {
|
|
21
|
+
type: "pdf";
|
|
22
|
+
maxPages?: number | undefined;
|
|
23
|
+
})[] | undefined;
|
|
24
|
+
onlyMainContent?: boolean | undefined;
|
|
25
|
+
includeTags?: string[] | undefined;
|
|
26
|
+
excludeTags?: string[] | undefined;
|
|
27
|
+
waitFor?: number | undefined;
|
|
28
|
+
actions?: {
|
|
29
|
+
type: "screenshot" | "wait" | "scroll" | "scrape" | "click" | "write" | "press" | "executeJavascript" | "generatePDF";
|
|
30
|
+
fullPage?: boolean | undefined;
|
|
31
|
+
selector?: string | undefined;
|
|
32
|
+
milliseconds?: number | undefined;
|
|
33
|
+
text?: string | undefined;
|
|
34
|
+
key?: string | undefined;
|
|
35
|
+
direction?: "up" | "down" | undefined;
|
|
36
|
+
script?: string | undefined;
|
|
37
|
+
}[] | undefined;
|
|
38
|
+
mobile?: boolean | undefined;
|
|
39
|
+
skipTlsVerification?: boolean | undefined;
|
|
40
|
+
removeBase64Images?: boolean | undefined;
|
|
41
|
+
location?: {
|
|
42
|
+
country?: string | undefined;
|
|
43
|
+
languages?: string[] | undefined;
|
|
44
|
+
} | undefined;
|
|
45
|
+
storeInCache?: boolean | undefined;
|
|
46
|
+
maxAge?: number | undefined;
|
|
47
|
+
headers?: Record<string, string> | undefined;
|
|
48
|
+
blockAds?: boolean | undefined;
|
|
49
|
+
proxy?: "basic" | "stealth" | "auto" | undefined;
|
|
50
|
+
zeroDataRetention?: boolean | undefined;
|
|
51
|
+
timeout?: number | undefined;
|
|
52
|
+
}, Omit<_mendable_firecrawl_js.Document, "json"> & {
|
|
53
|
+
json?: unknown;
|
|
54
|
+
}>;
|
|
55
|
+
|
|
56
|
+
declare const searchTool: ai.Tool<{
|
|
57
|
+
query: string;
|
|
58
|
+
country?: string | undefined;
|
|
59
|
+
location?: string | undefined;
|
|
60
|
+
timeout?: number | undefined;
|
|
61
|
+
limit?: number | undefined;
|
|
62
|
+
sources?: {
|
|
63
|
+
type: "web" | "images" | "news";
|
|
64
|
+
}[] | undefined;
|
|
65
|
+
categories?: {
|
|
66
|
+
type: "pdf" | "github" | "research";
|
|
67
|
+
}[] | undefined;
|
|
68
|
+
tbs?: string | undefined;
|
|
69
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
70
|
+
scrapeOptions?: {
|
|
71
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
72
|
+
onlyMainContent?: boolean | undefined;
|
|
73
|
+
} | undefined;
|
|
74
|
+
}, _mendable_firecrawl_js.SearchData>;
|
|
75
|
+
|
|
76
|
+
declare const mapTool: ai.Tool<{
|
|
77
|
+
url: string;
|
|
78
|
+
limit?: number | undefined;
|
|
79
|
+
search?: string | undefined;
|
|
80
|
+
includeSubdomains?: boolean | undefined;
|
|
81
|
+
ignoreSitemap?: boolean | undefined;
|
|
82
|
+
}, _mendable_firecrawl_js.MapData>;
|
|
83
|
+
|
|
84
|
+
declare const extractTool: ai.Tool<{
|
|
85
|
+
urls: string[];
|
|
86
|
+
prompt?: string | undefined;
|
|
87
|
+
schema?: Record<string, any> | undefined;
|
|
88
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
89
|
+
scrapeOptions?: {
|
|
90
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
91
|
+
onlyMainContent?: boolean | undefined;
|
|
92
|
+
includeTags?: string[] | undefined;
|
|
93
|
+
excludeTags?: string[] | undefined;
|
|
94
|
+
waitFor?: number | undefined;
|
|
95
|
+
mobile?: boolean | undefined;
|
|
96
|
+
} | undefined;
|
|
97
|
+
systemPrompt?: string | undefined;
|
|
98
|
+
allowExternalLinks?: boolean | undefined;
|
|
99
|
+
enableWebSearch?: boolean | undefined;
|
|
100
|
+
showSources?: boolean | undefined;
|
|
101
|
+
}, _mendable_firecrawl_js.ExtractResponse>;
|
|
102
|
+
|
|
103
|
+
declare const batchScrapeTool: ai.Tool<{
|
|
104
|
+
urls: string[];
|
|
105
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links")[] | undefined;
|
|
106
|
+
onlyMainContent?: boolean | undefined;
|
|
107
|
+
}, _mendable_firecrawl_js.BatchScrapeJob>;
|
|
108
|
+
|
|
109
|
+
declare const crawlTool: ai.Tool<{
|
|
110
|
+
url: string;
|
|
111
|
+
limit?: number | undefined;
|
|
112
|
+
scrapeOptions?: {
|
|
113
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
114
|
+
onlyMainContent?: boolean | undefined;
|
|
115
|
+
} | undefined;
|
|
116
|
+
allowExternalLinks?: boolean | undefined;
|
|
117
|
+
maxDepth?: number | undefined;
|
|
118
|
+
allowBackwardLinks?: boolean | undefined;
|
|
119
|
+
includePaths?: string[] | undefined;
|
|
120
|
+
excludePaths?: string[] | undefined;
|
|
121
|
+
}, _mendable_firecrawl_js.CrawlResponse>;
|
|
122
|
+
|
|
123
|
+
declare const pollTool: ai.Tool<{
|
|
124
|
+
jobType: "crawl" | "batchScrape" | "extract";
|
|
125
|
+
jobId: string;
|
|
126
|
+
pollInterval?: number | undefined;
|
|
127
|
+
maxWaitTime?: number | undefined;
|
|
128
|
+
}, {
|
|
129
|
+
success: boolean;
|
|
130
|
+
status: string;
|
|
131
|
+
data: any;
|
|
132
|
+
total?: undefined;
|
|
133
|
+
creditsUsed?: undefined;
|
|
134
|
+
error?: undefined;
|
|
135
|
+
} | {
|
|
136
|
+
success: boolean;
|
|
137
|
+
status: string;
|
|
138
|
+
data: any;
|
|
139
|
+
total: any;
|
|
140
|
+
creditsUsed: any;
|
|
141
|
+
error?: undefined;
|
|
142
|
+
} | {
|
|
143
|
+
success: boolean;
|
|
144
|
+
status: string;
|
|
145
|
+
error: any;
|
|
146
|
+
data?: undefined;
|
|
147
|
+
total?: undefined;
|
|
148
|
+
creditsUsed?: undefined;
|
|
149
|
+
}>;
|
|
150
|
+
declare const statusTool: ai.Tool<{
|
|
151
|
+
id: string;
|
|
152
|
+
jobType?: "crawl" | "batchScrape" | "extract" | undefined;
|
|
153
|
+
}, _mendable_firecrawl_js.ExtractResponse | _mendable_firecrawl_js.BatchScrapeJob | undefined>;
|
|
154
|
+
declare const cancelTool: ai.Tool<{
|
|
155
|
+
jobType: "crawl" | "batchScrape";
|
|
156
|
+
jobId: string;
|
|
157
|
+
}, {
|
|
158
|
+
success: boolean;
|
|
159
|
+
jobId: string;
|
|
160
|
+
jobType: "crawl" | "batchScrape";
|
|
161
|
+
message: string;
|
|
162
|
+
}>;
|
|
163
|
+
|
|
164
|
+
interface PollOptions {
|
|
165
|
+
pollInterval?: number;
|
|
166
|
+
timeout?: number;
|
|
167
|
+
onProgress?: (status: any) => void;
|
|
168
|
+
}
|
|
169
|
+
declare function pollCrawlJob(jobId: string, options?: PollOptions): Promise<any>;
|
|
170
|
+
declare function pollBatchScrapeJob(jobId: string, options?: PollOptions): Promise<any>;
|
|
171
|
+
declare function pollExtractJob(jobId: string, options?: PollOptions): Promise<any>;
|
|
172
|
+
|
|
173
|
+
declare const scrapeSchema: z.ZodObject<{
|
|
174
|
+
url: z.ZodString;
|
|
175
|
+
formats: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["markdown", "html", "rawHtml", "screenshot", "links", "summary", "changeTracking", "branding"]>, z.ZodObject<{
|
|
176
|
+
type: z.ZodLiteral<"json">;
|
|
177
|
+
prompt: z.ZodOptional<z.ZodString>;
|
|
178
|
+
schema: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
179
|
+
}, "strip", z.ZodTypeAny, {
|
|
180
|
+
type: "json";
|
|
181
|
+
prompt?: string | undefined;
|
|
182
|
+
schema?: Record<string, any> | undefined;
|
|
183
|
+
}, {
|
|
184
|
+
type: "json";
|
|
185
|
+
prompt?: string | undefined;
|
|
186
|
+
schema?: Record<string, any> | undefined;
|
|
187
|
+
}>, z.ZodObject<{
|
|
188
|
+
type: z.ZodLiteral<"screenshot">;
|
|
189
|
+
fullPage: z.ZodOptional<z.ZodBoolean>;
|
|
190
|
+
quality: z.ZodOptional<z.ZodNumber>;
|
|
191
|
+
viewport: z.ZodOptional<z.ZodObject<{
|
|
192
|
+
width: z.ZodNumber;
|
|
193
|
+
height: z.ZodNumber;
|
|
194
|
+
}, "strip", z.ZodTypeAny, {
|
|
195
|
+
width: number;
|
|
196
|
+
height: number;
|
|
197
|
+
}, {
|
|
198
|
+
width: number;
|
|
199
|
+
height: number;
|
|
200
|
+
}>>;
|
|
201
|
+
}, "strip", z.ZodTypeAny, {
|
|
202
|
+
type: "screenshot";
|
|
203
|
+
fullPage?: boolean | undefined;
|
|
204
|
+
quality?: number | undefined;
|
|
205
|
+
viewport?: {
|
|
206
|
+
width: number;
|
|
207
|
+
height: number;
|
|
208
|
+
} | undefined;
|
|
209
|
+
}, {
|
|
210
|
+
type: "screenshot";
|
|
211
|
+
fullPage?: boolean | undefined;
|
|
212
|
+
quality?: number | undefined;
|
|
213
|
+
viewport?: {
|
|
214
|
+
width: number;
|
|
215
|
+
height: number;
|
|
216
|
+
} | undefined;
|
|
217
|
+
}>]>, "many">>;
|
|
218
|
+
parsers: z.ZodOptional<z.ZodArray<z.ZodUnion<[z.ZodEnum<["pdf"]>, z.ZodObject<{
|
|
219
|
+
type: z.ZodEnum<["pdf"]>;
|
|
220
|
+
maxPages: z.ZodOptional<z.ZodNumber>;
|
|
221
|
+
}, "strip", z.ZodTypeAny, {
|
|
222
|
+
type: "pdf";
|
|
223
|
+
maxPages?: number | undefined;
|
|
224
|
+
}, {
|
|
225
|
+
type: "pdf";
|
|
226
|
+
maxPages?: number | undefined;
|
|
227
|
+
}>]>, "many">>;
|
|
228
|
+
onlyMainContent: z.ZodOptional<z.ZodBoolean>;
|
|
229
|
+
includeTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
230
|
+
excludeTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
231
|
+
waitFor: z.ZodOptional<z.ZodNumber>;
|
|
232
|
+
actions: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
233
|
+
type: z.ZodEnum<["wait", "screenshot", "scroll", "scrape", "click", "write", "press", "executeJavascript", "generatePDF"]>;
|
|
234
|
+
selector: z.ZodOptional<z.ZodString>;
|
|
235
|
+
milliseconds: z.ZodOptional<z.ZodNumber>;
|
|
236
|
+
text: z.ZodOptional<z.ZodString>;
|
|
237
|
+
key: z.ZodOptional<z.ZodString>;
|
|
238
|
+
direction: z.ZodOptional<z.ZodEnum<["up", "down"]>>;
|
|
239
|
+
script: z.ZodOptional<z.ZodString>;
|
|
240
|
+
fullPage: z.ZodOptional<z.ZodBoolean>;
|
|
241
|
+
}, "strip", z.ZodTypeAny, {
|
|
242
|
+
type: "screenshot" | "wait" | "scroll" | "scrape" | "click" | "write" | "press" | "executeJavascript" | "generatePDF";
|
|
243
|
+
fullPage?: boolean | undefined;
|
|
244
|
+
selector?: string | undefined;
|
|
245
|
+
milliseconds?: number | undefined;
|
|
246
|
+
text?: string | undefined;
|
|
247
|
+
key?: string | undefined;
|
|
248
|
+
direction?: "up" | "down" | undefined;
|
|
249
|
+
script?: string | undefined;
|
|
250
|
+
}, {
|
|
251
|
+
type: "screenshot" | "wait" | "scroll" | "scrape" | "click" | "write" | "press" | "executeJavascript" | "generatePDF";
|
|
252
|
+
fullPage?: boolean | undefined;
|
|
253
|
+
selector?: string | undefined;
|
|
254
|
+
milliseconds?: number | undefined;
|
|
255
|
+
text?: string | undefined;
|
|
256
|
+
key?: string | undefined;
|
|
257
|
+
direction?: "up" | "down" | undefined;
|
|
258
|
+
script?: string | undefined;
|
|
259
|
+
}>, "many">>;
|
|
260
|
+
mobile: z.ZodOptional<z.ZodBoolean>;
|
|
261
|
+
skipTlsVerification: z.ZodOptional<z.ZodBoolean>;
|
|
262
|
+
removeBase64Images: z.ZodOptional<z.ZodBoolean>;
|
|
263
|
+
location: z.ZodOptional<z.ZodObject<{
|
|
264
|
+
country: z.ZodOptional<z.ZodString>;
|
|
265
|
+
languages: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
266
|
+
}, "strip", z.ZodTypeAny, {
|
|
267
|
+
country?: string | undefined;
|
|
268
|
+
languages?: string[] | undefined;
|
|
269
|
+
}, {
|
|
270
|
+
country?: string | undefined;
|
|
271
|
+
languages?: string[] | undefined;
|
|
272
|
+
}>>;
|
|
273
|
+
storeInCache: z.ZodOptional<z.ZodBoolean>;
|
|
274
|
+
maxAge: z.ZodOptional<z.ZodNumber>;
|
|
275
|
+
headers: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodString>>;
|
|
276
|
+
blockAds: z.ZodOptional<z.ZodBoolean>;
|
|
277
|
+
proxy: z.ZodOptional<z.ZodEnum<["basic", "stealth", "auto"]>>;
|
|
278
|
+
zeroDataRetention: z.ZodOptional<z.ZodBoolean>;
|
|
279
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
280
|
+
}, "strip", z.ZodTypeAny, {
|
|
281
|
+
url: string;
|
|
282
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links" | "summary" | "changeTracking" | "branding" | {
|
|
283
|
+
type: "json";
|
|
284
|
+
prompt?: string | undefined;
|
|
285
|
+
schema?: Record<string, any> | undefined;
|
|
286
|
+
} | {
|
|
287
|
+
type: "screenshot";
|
|
288
|
+
fullPage?: boolean | undefined;
|
|
289
|
+
quality?: number | undefined;
|
|
290
|
+
viewport?: {
|
|
291
|
+
width: number;
|
|
292
|
+
height: number;
|
|
293
|
+
} | undefined;
|
|
294
|
+
})[] | undefined;
|
|
295
|
+
parsers?: ("pdf" | {
|
|
296
|
+
type: "pdf";
|
|
297
|
+
maxPages?: number | undefined;
|
|
298
|
+
})[] | undefined;
|
|
299
|
+
onlyMainContent?: boolean | undefined;
|
|
300
|
+
includeTags?: string[] | undefined;
|
|
301
|
+
excludeTags?: string[] | undefined;
|
|
302
|
+
waitFor?: number | undefined;
|
|
303
|
+
actions?: {
|
|
304
|
+
type: "screenshot" | "wait" | "scroll" | "scrape" | "click" | "write" | "press" | "executeJavascript" | "generatePDF";
|
|
305
|
+
fullPage?: boolean | undefined;
|
|
306
|
+
selector?: string | undefined;
|
|
307
|
+
milliseconds?: number | undefined;
|
|
308
|
+
text?: string | undefined;
|
|
309
|
+
key?: string | undefined;
|
|
310
|
+
direction?: "up" | "down" | undefined;
|
|
311
|
+
script?: string | undefined;
|
|
312
|
+
}[] | undefined;
|
|
313
|
+
mobile?: boolean | undefined;
|
|
314
|
+
skipTlsVerification?: boolean | undefined;
|
|
315
|
+
removeBase64Images?: boolean | undefined;
|
|
316
|
+
location?: {
|
|
317
|
+
country?: string | undefined;
|
|
318
|
+
languages?: string[] | undefined;
|
|
319
|
+
} | undefined;
|
|
320
|
+
storeInCache?: boolean | undefined;
|
|
321
|
+
maxAge?: number | undefined;
|
|
322
|
+
headers?: Record<string, string> | undefined;
|
|
323
|
+
blockAds?: boolean | undefined;
|
|
324
|
+
proxy?: "basic" | "stealth" | "auto" | undefined;
|
|
325
|
+
zeroDataRetention?: boolean | undefined;
|
|
326
|
+
timeout?: number | undefined;
|
|
327
|
+
}, {
|
|
328
|
+
url: string;
|
|
329
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links" | "summary" | "changeTracking" | "branding" | {
|
|
330
|
+
type: "json";
|
|
331
|
+
prompt?: string | undefined;
|
|
332
|
+
schema?: Record<string, any> | undefined;
|
|
333
|
+
} | {
|
|
334
|
+
type: "screenshot";
|
|
335
|
+
fullPage?: boolean | undefined;
|
|
336
|
+
quality?: number | undefined;
|
|
337
|
+
viewport?: {
|
|
338
|
+
width: number;
|
|
339
|
+
height: number;
|
|
340
|
+
} | undefined;
|
|
341
|
+
})[] | undefined;
|
|
342
|
+
parsers?: ("pdf" | {
|
|
343
|
+
type: "pdf";
|
|
344
|
+
maxPages?: number | undefined;
|
|
345
|
+
})[] | undefined;
|
|
346
|
+
onlyMainContent?: boolean | undefined;
|
|
347
|
+
includeTags?: string[] | undefined;
|
|
348
|
+
excludeTags?: string[] | undefined;
|
|
349
|
+
waitFor?: number | undefined;
|
|
350
|
+
actions?: {
|
|
351
|
+
type: "screenshot" | "wait" | "scroll" | "scrape" | "click" | "write" | "press" | "executeJavascript" | "generatePDF";
|
|
352
|
+
fullPage?: boolean | undefined;
|
|
353
|
+
selector?: string | undefined;
|
|
354
|
+
milliseconds?: number | undefined;
|
|
355
|
+
text?: string | undefined;
|
|
356
|
+
key?: string | undefined;
|
|
357
|
+
direction?: "up" | "down" | undefined;
|
|
358
|
+
script?: string | undefined;
|
|
359
|
+
}[] | undefined;
|
|
360
|
+
mobile?: boolean | undefined;
|
|
361
|
+
skipTlsVerification?: boolean | undefined;
|
|
362
|
+
removeBase64Images?: boolean | undefined;
|
|
363
|
+
location?: {
|
|
364
|
+
country?: string | undefined;
|
|
365
|
+
languages?: string[] | undefined;
|
|
366
|
+
} | undefined;
|
|
367
|
+
storeInCache?: boolean | undefined;
|
|
368
|
+
maxAge?: number | undefined;
|
|
369
|
+
headers?: Record<string, string> | undefined;
|
|
370
|
+
blockAds?: boolean | undefined;
|
|
371
|
+
proxy?: "basic" | "stealth" | "auto" | undefined;
|
|
372
|
+
zeroDataRetention?: boolean | undefined;
|
|
373
|
+
timeout?: number | undefined;
|
|
374
|
+
}>;
|
|
375
|
+
declare const searchSchema: z.ZodObject<{
|
|
376
|
+
query: z.ZodString;
|
|
377
|
+
limit: z.ZodOptional<z.ZodNumber>;
|
|
378
|
+
sources: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
379
|
+
type: z.ZodEnum<["web", "images", "news"]>;
|
|
380
|
+
}, "strip", z.ZodTypeAny, {
|
|
381
|
+
type: "web" | "images" | "news";
|
|
382
|
+
}, {
|
|
383
|
+
type: "web" | "images" | "news";
|
|
384
|
+
}>, "many">>;
|
|
385
|
+
categories: z.ZodOptional<z.ZodArray<z.ZodObject<{
|
|
386
|
+
type: z.ZodEnum<["github", "research", "pdf"]>;
|
|
387
|
+
}, "strip", z.ZodTypeAny, {
|
|
388
|
+
type: "pdf" | "github" | "research";
|
|
389
|
+
}, {
|
|
390
|
+
type: "pdf" | "github" | "research";
|
|
391
|
+
}>, "many">>;
|
|
392
|
+
tbs: z.ZodOptional<z.ZodString>;
|
|
393
|
+
location: z.ZodOptional<z.ZodString>;
|
|
394
|
+
country: z.ZodOptional<z.ZodString>;
|
|
395
|
+
timeout: z.ZodOptional<z.ZodNumber>;
|
|
396
|
+
ignoreInvalidURLs: z.ZodOptional<z.ZodBoolean>;
|
|
397
|
+
scrapeOptions: z.ZodOptional<z.ZodObject<{
|
|
398
|
+
formats: z.ZodOptional<z.ZodArray<z.ZodEnum<["markdown", "html", "rawHtml"]>, "many">>;
|
|
399
|
+
onlyMainContent: z.ZodOptional<z.ZodBoolean>;
|
|
400
|
+
}, "strip", z.ZodTypeAny, {
|
|
401
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
402
|
+
onlyMainContent?: boolean | undefined;
|
|
403
|
+
}, {
|
|
404
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
405
|
+
onlyMainContent?: boolean | undefined;
|
|
406
|
+
}>>;
|
|
407
|
+
}, "strip", z.ZodTypeAny, {
|
|
408
|
+
query: string;
|
|
409
|
+
country?: string | undefined;
|
|
410
|
+
location?: string | undefined;
|
|
411
|
+
timeout?: number | undefined;
|
|
412
|
+
limit?: number | undefined;
|
|
413
|
+
sources?: {
|
|
414
|
+
type: "web" | "images" | "news";
|
|
415
|
+
}[] | undefined;
|
|
416
|
+
categories?: {
|
|
417
|
+
type: "pdf" | "github" | "research";
|
|
418
|
+
}[] | undefined;
|
|
419
|
+
tbs?: string | undefined;
|
|
420
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
421
|
+
scrapeOptions?: {
|
|
422
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
423
|
+
onlyMainContent?: boolean | undefined;
|
|
424
|
+
} | undefined;
|
|
425
|
+
}, {
|
|
426
|
+
query: string;
|
|
427
|
+
country?: string | undefined;
|
|
428
|
+
location?: string | undefined;
|
|
429
|
+
timeout?: number | undefined;
|
|
430
|
+
limit?: number | undefined;
|
|
431
|
+
sources?: {
|
|
432
|
+
type: "web" | "images" | "news";
|
|
433
|
+
}[] | undefined;
|
|
434
|
+
categories?: {
|
|
435
|
+
type: "pdf" | "github" | "research";
|
|
436
|
+
}[] | undefined;
|
|
437
|
+
tbs?: string | undefined;
|
|
438
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
439
|
+
scrapeOptions?: {
|
|
440
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
441
|
+
onlyMainContent?: boolean | undefined;
|
|
442
|
+
} | undefined;
|
|
443
|
+
}>;
|
|
444
|
+
declare const mapSchema: z.ZodObject<{
|
|
445
|
+
url: z.ZodString;
|
|
446
|
+
search: z.ZodOptional<z.ZodString>;
|
|
447
|
+
limit: z.ZodOptional<z.ZodNumber>;
|
|
448
|
+
includeSubdomains: z.ZodOptional<z.ZodBoolean>;
|
|
449
|
+
ignoreSitemap: z.ZodOptional<z.ZodBoolean>;
|
|
450
|
+
}, "strip", z.ZodTypeAny, {
|
|
451
|
+
url: string;
|
|
452
|
+
limit?: number | undefined;
|
|
453
|
+
search?: string | undefined;
|
|
454
|
+
includeSubdomains?: boolean | undefined;
|
|
455
|
+
ignoreSitemap?: boolean | undefined;
|
|
456
|
+
}, {
|
|
457
|
+
url: string;
|
|
458
|
+
limit?: number | undefined;
|
|
459
|
+
search?: string | undefined;
|
|
460
|
+
includeSubdomains?: boolean | undefined;
|
|
461
|
+
ignoreSitemap?: boolean | undefined;
|
|
462
|
+
}>;
|
|
463
|
+
declare const extractSchema: z.ZodObject<{
|
|
464
|
+
urls: z.ZodArray<z.ZodString, "many">;
|
|
465
|
+
prompt: z.ZodOptional<z.ZodString>;
|
|
466
|
+
schema: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
467
|
+
systemPrompt: z.ZodOptional<z.ZodString>;
|
|
468
|
+
allowExternalLinks: z.ZodOptional<z.ZodBoolean>;
|
|
469
|
+
enableWebSearch: z.ZodOptional<z.ZodBoolean>;
|
|
470
|
+
showSources: z.ZodOptional<z.ZodBoolean>;
|
|
471
|
+
ignoreInvalidURLs: z.ZodOptional<z.ZodBoolean>;
|
|
472
|
+
scrapeOptions: z.ZodOptional<z.ZodObject<{
|
|
473
|
+
formats: z.ZodOptional<z.ZodArray<z.ZodEnum<["markdown", "html", "rawHtml"]>, "many">>;
|
|
474
|
+
onlyMainContent: z.ZodOptional<z.ZodBoolean>;
|
|
475
|
+
includeTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
476
|
+
excludeTags: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
477
|
+
waitFor: z.ZodOptional<z.ZodNumber>;
|
|
478
|
+
mobile: z.ZodOptional<z.ZodBoolean>;
|
|
479
|
+
}, "strip", z.ZodTypeAny, {
|
|
480
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
481
|
+
onlyMainContent?: boolean | undefined;
|
|
482
|
+
includeTags?: string[] | undefined;
|
|
483
|
+
excludeTags?: string[] | undefined;
|
|
484
|
+
waitFor?: number | undefined;
|
|
485
|
+
mobile?: boolean | undefined;
|
|
486
|
+
}, {
|
|
487
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
488
|
+
onlyMainContent?: boolean | undefined;
|
|
489
|
+
includeTags?: string[] | undefined;
|
|
490
|
+
excludeTags?: string[] | undefined;
|
|
491
|
+
waitFor?: number | undefined;
|
|
492
|
+
mobile?: boolean | undefined;
|
|
493
|
+
}>>;
|
|
494
|
+
}, "strip", z.ZodTypeAny, {
|
|
495
|
+
urls: string[];
|
|
496
|
+
prompt?: string | undefined;
|
|
497
|
+
schema?: Record<string, any> | undefined;
|
|
498
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
499
|
+
scrapeOptions?: {
|
|
500
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
501
|
+
onlyMainContent?: boolean | undefined;
|
|
502
|
+
includeTags?: string[] | undefined;
|
|
503
|
+
excludeTags?: string[] | undefined;
|
|
504
|
+
waitFor?: number | undefined;
|
|
505
|
+
mobile?: boolean | undefined;
|
|
506
|
+
} | undefined;
|
|
507
|
+
systemPrompt?: string | undefined;
|
|
508
|
+
allowExternalLinks?: boolean | undefined;
|
|
509
|
+
enableWebSearch?: boolean | undefined;
|
|
510
|
+
showSources?: boolean | undefined;
|
|
511
|
+
}, {
|
|
512
|
+
urls: string[];
|
|
513
|
+
prompt?: string | undefined;
|
|
514
|
+
schema?: Record<string, any> | undefined;
|
|
515
|
+
ignoreInvalidURLs?: boolean | undefined;
|
|
516
|
+
scrapeOptions?: {
|
|
517
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
518
|
+
onlyMainContent?: boolean | undefined;
|
|
519
|
+
includeTags?: string[] | undefined;
|
|
520
|
+
excludeTags?: string[] | undefined;
|
|
521
|
+
waitFor?: number | undefined;
|
|
522
|
+
mobile?: boolean | undefined;
|
|
523
|
+
} | undefined;
|
|
524
|
+
systemPrompt?: string | undefined;
|
|
525
|
+
allowExternalLinks?: boolean | undefined;
|
|
526
|
+
enableWebSearch?: boolean | undefined;
|
|
527
|
+
showSources?: boolean | undefined;
|
|
528
|
+
}>;
|
|
529
|
+
declare const batchScrapeSchema: z.ZodObject<{
|
|
530
|
+
urls: z.ZodArray<z.ZodString, "many">;
|
|
531
|
+
formats: z.ZodOptional<z.ZodArray<z.ZodEnum<["markdown", "html", "rawHtml", "screenshot", "links"]>, "many">>;
|
|
532
|
+
onlyMainContent: z.ZodOptional<z.ZodBoolean>;
|
|
533
|
+
}, "strip", z.ZodTypeAny, {
|
|
534
|
+
urls: string[];
|
|
535
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links")[] | undefined;
|
|
536
|
+
onlyMainContent?: boolean | undefined;
|
|
537
|
+
}, {
|
|
538
|
+
urls: string[];
|
|
539
|
+
formats?: ("markdown" | "html" | "rawHtml" | "screenshot" | "links")[] | undefined;
|
|
540
|
+
onlyMainContent?: boolean | undefined;
|
|
541
|
+
}>;
|
|
542
|
+
declare const crawlSchema: z.ZodObject<{
|
|
543
|
+
url: z.ZodString;
|
|
544
|
+
limit: z.ZodOptional<z.ZodNumber>;
|
|
545
|
+
maxDepth: z.ZodOptional<z.ZodNumber>;
|
|
546
|
+
allowExternalLinks: z.ZodOptional<z.ZodBoolean>;
|
|
547
|
+
allowBackwardLinks: z.ZodOptional<z.ZodBoolean>;
|
|
548
|
+
includePaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
549
|
+
excludePaths: z.ZodOptional<z.ZodArray<z.ZodString, "many">>;
|
|
550
|
+
scrapeOptions: z.ZodOptional<z.ZodObject<{
|
|
551
|
+
formats: z.ZodOptional<z.ZodArray<z.ZodEnum<["markdown", "html", "rawHtml"]>, "many">>;
|
|
552
|
+
onlyMainContent: z.ZodOptional<z.ZodBoolean>;
|
|
553
|
+
}, "strip", z.ZodTypeAny, {
|
|
554
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
555
|
+
onlyMainContent?: boolean | undefined;
|
|
556
|
+
}, {
|
|
557
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
558
|
+
onlyMainContent?: boolean | undefined;
|
|
559
|
+
}>>;
|
|
560
|
+
}, "strip", z.ZodTypeAny, {
|
|
561
|
+
url: string;
|
|
562
|
+
limit?: number | undefined;
|
|
563
|
+
scrapeOptions?: {
|
|
564
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
565
|
+
onlyMainContent?: boolean | undefined;
|
|
566
|
+
} | undefined;
|
|
567
|
+
allowExternalLinks?: boolean | undefined;
|
|
568
|
+
maxDepth?: number | undefined;
|
|
569
|
+
allowBackwardLinks?: boolean | undefined;
|
|
570
|
+
includePaths?: string[] | undefined;
|
|
571
|
+
excludePaths?: string[] | undefined;
|
|
572
|
+
}, {
|
|
573
|
+
url: string;
|
|
574
|
+
limit?: number | undefined;
|
|
575
|
+
scrapeOptions?: {
|
|
576
|
+
formats?: ("markdown" | "html" | "rawHtml")[] | undefined;
|
|
577
|
+
onlyMainContent?: boolean | undefined;
|
|
578
|
+
} | undefined;
|
|
579
|
+
allowExternalLinks?: boolean | undefined;
|
|
580
|
+
maxDepth?: number | undefined;
|
|
581
|
+
allowBackwardLinks?: boolean | undefined;
|
|
582
|
+
includePaths?: string[] | undefined;
|
|
583
|
+
excludePaths?: string[] | undefined;
|
|
584
|
+
}>;
|
|
585
|
+
// NOTE(review): auto-generated declaration output (tsup --dts). Edit the
// source schemas in src/schemas/firecrawl.ts, not this file.
declare const checkStatusSchema: z.ZodObject<{
    id: z.ZodString;
    jobType: z.ZodOptional<z.ZodEnum<["crawl", "batchScrape", "extract"]>>;
}, "strip", z.ZodTypeAny, {
    id: string;
    jobType?: "crawl" | "batchScrape" | "extract" | undefined;
}, {
    id: string;
    jobType?: "crawl" | "batchScrape" | "extract" | undefined;
}>;
// Inferred input types for each tool's zod schema.
type ScrapeParams = z.infer<typeof scrapeSchema>;
type SearchParams = z.infer<typeof searchSchema>;
type MapParams = z.infer<typeof mapSchema>;
type ExtractParams = z.infer<typeof extractSchema>;
type BatchScrapeParams = z.infer<typeof batchScrapeSchema>;
type CrawlParams = z.infer<typeof crawlSchema>;
type CheckStatusParams = z.infer<typeof checkStatusSchema>;

export { type BatchScrapeParams, type CheckStatusParams, type CrawlParams, type ExtractParams, type MapParams, type PollOptions, type ScrapeParams, type SearchParams, batchScrapeSchema, batchScrapeTool, cancelTool, checkStatusSchema, crawlSchema, crawlTool, extractSchema, extractTool, mapSchema, mapTool, pollBatchScrapeJob, pollCrawlJob, pollExtractJob, pollTool, scrapeSchema, scrapeTool, searchSchema, searchTool, statusTool };
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,539 @@
|
|
|
1
|
+
// src/tools/scrape.ts
|
|
2
|
+
import { tool } from "ai";
|
|
3
|
+
import Firecrawl from "@mendable/firecrawl-js";
|
|
4
|
+
|
|
5
|
+
// src/schemas/firecrawl.ts
|
|
6
|
+
import { z } from "zod";
|
|
7
|
+
// Zod input schema for scrapeTool: validates a single-URL scrape request.
// The .describe() strings double as parameter docs shown to the LLM.
var scrapeSchema = z.object({
  url: z.string().url().describe("The URL to scrape content from"),
  // Output formats: either a bare enum name, or an object form for "json" /
  // "screenshot", which carry extra per-format configuration.
  formats: z.array(
    z.union([
      z.enum([
        "markdown",
        "html",
        "rawHtml",
        "screenshot",
        "links",
        "summary",
        "changeTracking",
        "branding"
      ]),
      z.object({
        type: z.literal("json"),
        prompt: z.string().optional(),
        schema: z.record(z.string(), z.any()).optional()
      }),
      z.object({
        type: z.literal("screenshot"),
        fullPage: z.boolean().optional(),
        quality: z.number().optional(),
        viewport: z.object({ width: z.number(), height: z.number() }).optional()
      })
    ])
  ).optional().describe("Output formats for the scraped content"),
  // Content parsers: "pdf" shorthand string, or object form with maxPages.
  parsers: z.array(
    z.union([
      z.enum(["pdf"]),
      z.object({
        type: z.enum(["pdf"]),
        maxPages: z.number().int().min(1).max(1e4).optional()
      })
    ])
  ).optional().describe("Parsers to use for processing content (e.g., PDF parsing)"),
  onlyMainContent: z.boolean().optional().describe("Extract only the main content, removing headers, footers, and navigation"),
  includeTags: z.array(z.string()).optional().describe("HTML tags to include in the output"),
  excludeTags: z.array(z.string()).optional().describe("HTML tags to exclude from the output"),
  waitFor: z.number().optional().describe("Time in milliseconds to wait before scraping"),
  // Pre-scrape browser automation steps; which optional fields apply depends
  // on the action "type" (e.g. selector for click, milliseconds for wait).
  actions: z.array(
    z.object({
      type: z.enum(["wait", "screenshot", "scroll", "scrape", "click", "write", "press", "executeJavascript", "generatePDF"]),
      selector: z.string().optional(),
      milliseconds: z.number().optional(),
      text: z.string().optional(),
      key: z.string().optional(),
      direction: z.enum(["up", "down"]).optional(),
      script: z.string().optional(),
      fullPage: z.boolean().optional()
    })
  ).optional().describe("Browser actions to perform before scraping (click, scroll, type, wait, etc.)"),
  mobile: z.boolean().optional().describe("Emulate mobile device"),
  skipTlsVerification: z.boolean().optional().describe("Skip TLS certificate verification"),
  removeBase64Images: z.boolean().optional().describe("Remove base64 encoded images from the output"),
  location: z.object({
    country: z.string().optional(),
    languages: z.array(z.string()).optional()
  }).optional().describe("Location settings for the scraping request"),
  storeInCache: z.boolean().optional().describe("Store the scraped content in cache"),
  maxAge: z.number().optional().describe("Maximum age of cached content in milliseconds. Default: 172800000 (2 days)"),
  headers: z.record(z.string()).optional().describe("Custom headers to send with the request (cookies, user-agent, etc.)"),
  blockAds: z.boolean().optional().describe("Enable ad-blocking and cookie popup blocking. Default: true"),
  proxy: z.enum(["basic", "stealth", "auto"]).optional().describe("Proxy type: basic (fast), stealth (reliable, 5 credits), auto (retry with stealth). Default: auto"),
  zeroDataRetention: z.boolean().optional().describe("Enable zero data retention. Contact help@firecrawl.dev to enable this feature"),
  timeout: z.number().optional().describe("Timeout in milliseconds. Default: 60000")
});
|
|
74
|
+
// Zod input schema for searchTool: validates a web-search request.
var searchSchema = z.object({
  query: z.string().min(1).describe("The search query. Supports operators like site:, inurl:, intitle:, etc."),
  limit: z.number().int().min(1).max(100).optional().describe("Maximum number of search results to return. Default: 5"),
  sources: z.array(z.object({ type: z.enum(["web", "images", "news"]) })).optional().describe("Sources to search: web, images, news"),
  categories: z.array(z.object({ type: z.enum(["github", "research", "pdf"]) })).optional().describe("Filter by categories: github (repositories), research (academic), pdf (PDFs)"),
  tbs: z.string().optional().describe("Time-based search: qdr:h (hour), qdr:d (day), qdr:w (week), qdr:m (month), qdr:y (year)"),
  location: z.string().optional().describe('Location for search results (e.g., "San Francisco,California,United States")'),
  country: z.string().optional().describe('ISO country code for geo-targeting (e.g., "US", "DE"). Default: "US"'),
  timeout: z.number().optional().describe("Timeout in milliseconds. Default: 60000"),
  ignoreInvalidURLs: z.boolean().optional().describe("Exclude invalid URLs from results. Default: false"),
  // Optional content scraping applied to each search hit.
  scrapeOptions: z.object({
    formats: z.array(z.enum(["markdown", "html", "rawHtml"])).optional(),
    onlyMainContent: z.boolean().optional()
  }).optional().describe("Options for scraping the search results")
});
|
|
89
|
+
// Zod input schema for mapTool: validates a URL-discovery (site map) request.
var mapSchema = z.object({
  url: z.string().url().describe("The website URL to map and discover URLs from"),
  search: z.string().optional().describe("Filter URLs containing this keyword"),
  limit: z.number().int().positive().optional().describe("Maximum number of URLs to discover"),
  includeSubdomains: z.boolean().optional().describe("Include URLs from subdomains"),
  ignoreSitemap: z.boolean().optional().describe("Ignore the sitemap and crawl the site directly")
});
|
|
96
|
+
// Zod input schema for extractTool: validates an LLM structured-extraction request.
var extractSchema = z.object({
  urls: z.array(z.string().url()).min(1).describe("Array of URLs to extract structured data from"),
  prompt: z.string().optional().describe("Custom prompt to guide the LLM extraction"),
  schema: z.record(z.any()).optional().describe("JSON schema defining the structure of data to extract"),
  systemPrompt: z.string().optional().describe("System prompt to guide the extraction behavior"),
  allowExternalLinks: z.boolean().optional().describe("Allow extraction from external links found on the page"),
  enableWebSearch: z.boolean().optional().describe("Enable web search to enrich results"),
  showSources: z.boolean().optional().describe("Include source URLs in the response"),
  ignoreInvalidURLs: z.boolean().optional().describe("Continue extraction even if some URLs are invalid. Default: true"),
  // Per-page scraping behavior applied before extraction runs.
  scrapeOptions: z.object({
    formats: z.array(z.enum(["markdown", "html", "rawHtml"])).optional(),
    onlyMainContent: z.boolean().optional(),
    includeTags: z.array(z.string()).optional(),
    excludeTags: z.array(z.string()).optional(),
    waitFor: z.number().optional(),
    mobile: z.boolean().optional()
  }).optional().describe("Advanced scraping options for extraction")
});
|
|
114
|
+
// Zod input schema for batchScrapeTool: validates a multi-URL scrape request.
var batchScrapeSchema = z.object({
  urls: z.array(z.string().url()).min(1).max(10).describe("Array of URLs to scrape (max 10)"),
  formats: z.array(z.enum(["markdown", "html", "rawHtml", "screenshot", "links"])).optional().describe("Output formats for the scraped content"),
  onlyMainContent: z.boolean().optional().describe("Extract only the main content from each URL")
});
|
|
119
|
+
// Zod input schema for crawlTool: validates an async multi-page crawl request.
var crawlSchema = z.object({
  url: z.string().url().describe("The starting URL to crawl from"),
  limit: z.number().int().positive().optional().describe("Maximum number of pages to crawl"),
  maxDepth: z.number().int().positive().optional().describe("Maximum depth of crawling from the starting URL"),
  allowExternalLinks: z.boolean().optional().describe("Allow crawling external links"),
  allowBackwardLinks: z.boolean().optional().describe("Allow crawling backward links"),
  includePaths: z.array(z.string()).optional().describe("Only crawl URLs matching these path patterns"),
  excludePaths: z.array(z.string()).optional().describe("Exclude URLs matching these path patterns"),
  // Scraping behavior applied to each crawled page.
  scrapeOptions: z.object({
    formats: z.array(z.enum(["markdown", "html", "rawHtml"])).optional(),
    onlyMainContent: z.boolean().optional()
  }).optional().describe("Options for scraping crawled pages")
});
|
|
132
|
+
// Zod input schema for statusTool: identifies the async job to inspect.
var checkStatusSchema = z.object({
  id: z.string().describe("The job ID to check status for"),
  // statusTool falls back to "crawl" when jobType is omitted.
  jobType: z.enum(["crawl", "batchScrape", "extract"]).optional().describe("Type of job (crawl, batchScrape, or extract). Optional but recommended.")
});
|
|
136
|
+
|
|
137
|
+
// src/tools/scrape.ts
|
|
138
|
+
var scrapeTool = tool({
  description: `Scrape content from a single URL with advanced options.

Best for: Single page content extraction when you know the exact URL.
Returns: Content in specified formats (markdown, html, rawHtml, screenshot, or links).

Example use cases:
- Extract article content from a blog post
- Get product information from an e-commerce page
- Scrape documentation from a specific page`,
  inputSchema: scrapeSchema,
  // Validates input via scrapeSchema and delegates to the Firecrawl scrape API.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async function(args) {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl({ apiKey });
    // Fix: forward ALL validated options to the API. Previously only six
    // fields (formats, onlyMainContent, includeTags, excludeTags, waitFor,
    // timeout) were passed, silently dropping parsers, actions, mobile,
    // proxy, headers, location, caching and the other options the schema
    // advertises to the model.
    const { url, formats, ...options } = args;
    const result = await firecrawl.scrape(url, {
      ...options,
      // Preserve the original default: markdown when no formats requested.
      formats: formats || ["markdown"]
    });
    return result;
  }
});
|
|
166
|
+
|
|
167
|
+
// src/tools/search.ts
|
|
168
|
+
import { tool as tool2 } from "ai";
|
|
169
|
+
import Firecrawl2 from "@mendable/firecrawl-js";
|
|
170
|
+
var searchTool = tool2({
  description: `Search the web and optionally scrape content from search results.

Best for: Finding information across multiple websites when you don't know the exact URL.
Supports search operators: site:, inurl:, intitle:, related:, etc.

Example use cases:
- Find the latest news about a topic
- Search for specific information across multiple sources
- Discover content related to a query`,
  inputSchema: searchSchema,
  // Validates input via searchSchema and delegates to the Firecrawl search API.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl2({ apiKey });
    const result = await firecrawl.search(args.query, {
      limit: args.limit || 5,
      sources: args.sources,
      categories: args.categories,
      tbs: args.tbs,
      location: args.location,
      // Fix: `country` is declared in searchSchema but was never forwarded,
      // so geo-targeting silently fell back to the server default.
      country: args.country,
      timeout: args.timeout,
      ignoreInvalidURLs: args.ignoreInvalidURLs,
      scrapeOptions: args.scrapeOptions
    });
    return result;
  }
});
|
|
200
|
+
|
|
201
|
+
// src/tools/map.ts
|
|
202
|
+
import { tool as tool3 } from "ai";
|
|
203
|
+
import Firecrawl3 from "@mendable/firecrawl-js";
|
|
204
|
+
var mapTool = tool3({
  description: `Map a website to discover all indexed URLs.

Best for: Discovering URLs on a website before deciding what to scrape or analyzing site structure.

Example use cases:
- Find all pages on a website
- Discover blog posts or articles
- Analyze website structure
- Find specific sections of a website`,
  inputSchema: mapSchema,
  // Validates input via mapSchema and delegates to the Firecrawl map API.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl3({ apiKey });
    const result = await firecrawl.map(args.url, {
      search: args.search,
      limit: args.limit,
      includeSubdomains: args.includeSubdomains,
      // Fix: `ignoreSitemap` is declared in mapSchema but was never
      // forwarded, so the option had no effect.
      ignoreSitemap: args.ignoreSitemap
    });
    return result;
  }
});
|
|
229
|
+
|
|
230
|
+
// src/tools/extract.ts
|
|
231
|
+
import { tool as tool4 } from "ai";
|
|
232
|
+
import Firecrawl4 from "@mendable/firecrawl-js";
|
|
233
|
+
var extractTool = tool4({
  description: `Extract structured information from web pages using LLM.

Best for: Extracting specific structured data like prices, names, details from web pages.

Returns:
- Immediately returns data for small requests.
- Returns a job ID for larger requests (use pollTool with the ID to get results).

Example use cases:
- Extract product information (name, price, description)
- Get contact details from company websites
- Parse event information (date, location, speakers)
- Scrape job listings with structured fields`,
  inputSchema: extractSchema,
  // Validates input via extractSchema and starts an extraction job.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const client = new Firecrawl4({ apiKey });
    // Assemble the extraction request from the validated arguments.
    const request = {
      urls: args.urls,
      prompt: args.prompt,
      schema: args.schema,
      systemPrompt: args.systemPrompt,
      allowExternalLinks: args.allowExternalLinks,
      enableWebSearch: args.enableWebSearch,
      showSources: args.showSources,
      ignoreInvalidURLs: args.ignoreInvalidURLs,
      scrapeOptions: args.scrapeOptions
    };
    return client.startExtract(request);
  }
});
|
|
268
|
+
|
|
269
|
+
// src/tools/batch.ts
|
|
270
|
+
import { tool as tool5 } from "ai";
|
|
271
|
+
import Firecrawl5 from "@mendable/firecrawl-js";
|
|
272
|
+
var batchScrapeTool = tool5({
  description: `Scrape multiple URLs in a single request (max 10 URLs).

Best for: Scraping content from multiple known URLs efficiently.

Example use cases:
- Scrape multiple articles from a list of URLs
- Get product pages in bulk
- Extract content from several documentation pages
- Batch process a list of web pages`,
  inputSchema: batchScrapeSchema,
  // Validates input via batchScrapeSchema and runs a batch scrape.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl5({ apiKey });
    // Fix: the schema accepts `formats` and `onlyMainContent`, but they were
    // never forwarded — every batch ran with server defaults. The v4 SDK
    // takes per-URL scrape options nested under `options`.
    // NOTE(review): confirm the nesting against the installed
    // @mendable/firecrawl-js version's batchScrape signature.
    const result = await firecrawl.batchScrape(args.urls, {
      options: {
        formats: args.formats,
        onlyMainContent: args.onlyMainContent
      }
    });
    return result;
  }
});
|
|
293
|
+
|
|
294
|
+
// src/tools/crawl.ts
|
|
295
|
+
import { tool as tool6 } from "ai";
|
|
296
|
+
import Firecrawl6 from "@mendable/firecrawl-js";
|
|
297
|
+
var crawlTool = tool6({
  description: `Start a crawl job to extract content from multiple related pages on a website.

Best for: Comprehensive content extraction from multiple pages with depth control.
Note: This is an asynchronous operation that returns a job ID. Use checkCrawlStatusTool to monitor progress.

Example use cases:
- Crawl an entire blog section
- Extract all documentation pages
- Scrape product catalog with pagination
- Comprehensive site analysis`,
  inputSchema: crawlSchema,
  // Validates input via crawlSchema and starts an async crawl job.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl6({ apiKey });
    const result = await firecrawl.startCrawl(args.url, {
      limit: args.limit,
      // Fix: maxDepth and allowBackwardLinks are declared in crawlSchema but
      // were never forwarded, so the depth/backlink controls had no effect.
      // NOTE(review): the Firecrawl v2 API names these maxDiscoveryDepth and
      // crawlEntireDomain — confirm whether the SDK translates the old names.
      maxDepth: args.maxDepth,
      allowBackwardLinks: args.allowBackwardLinks,
      allowExternalLinks: args.allowExternalLinks,
      includePaths: args.includePaths,
      excludePaths: args.excludePaths,
      scrapeOptions: args.scrapeOptions
    });
    return result;
  }
});
|
|
325
|
+
|
|
326
|
+
// src/tools/job-management.ts
|
|
327
|
+
import { tool as tool7 } from "ai";
|
|
328
|
+
import { z as z2 } from "zod";
|
|
329
|
+
import Firecrawl8 from "@mendable/firecrawl-js";
|
|
330
|
+
|
|
331
|
+
// src/helpers/poll-job.ts
|
|
332
|
+
import Firecrawl7 from "@mendable/firecrawl-js";
|
|
333
|
+
// Builds an authenticated Firecrawl client for the poll helpers below.
// Throws when FIRECRAWL_API_KEY is missing from the environment.
function createPollClient() {
  const apiKey = process.env.FIRECRAWL_API_KEY;
  if (!apiKey) {
    throw new Error("FIRECRAWL_API_KEY environment variable is required");
  }
  return new Firecrawl7({ apiKey });
}
// Generic polling loop shared by all three job types (the original code
// triplicated this verbatim). Fetches status via `getStatus` every
// `pollInterval` ms until the job completes (returns the final status),
// fails or is cancelled (throws), or `timeout` ms elapse (throws).
// `jobLabel` prefixes error messages, e.g. "Crawl job".
async function pollJobStatus(jobLabel, getStatus, { pollInterval, timeout, onProgress }) {
  const startTime = Date.now();
  while (Date.now() - startTime < timeout) {
    const status = await getStatus();
    if (onProgress) {
      onProgress(status);
    }
    if (status.status === "completed") {
      return status;
    }
    if (status.status === "failed" || status.status === "cancelled") {
      throw new Error(`${jobLabel} ${status.status}: ${status.error || "Unknown error"}`);
    }
    await new Promise((resolve) => setTimeout(resolve, pollInterval));
  }
  throw new Error(`${jobLabel} timed out after ${timeout}ms`);
}
// Polls a crawl job until completion. Defaults: 2s interval, 60s timeout.
async function pollCrawlJob(jobId, options = {}) {
  const { pollInterval = 2e3, timeout = 6e4, onProgress } = options;
  const firecrawl = createPollClient();
  return pollJobStatus("Crawl job", () => firecrawl.getCrawlStatus(jobId), { pollInterval, timeout, onProgress });
}
// Polls a batch scrape job until completion. Defaults: 2s interval, 60s timeout.
async function pollBatchScrapeJob(jobId, options = {}) {
  const { pollInterval = 2e3, timeout = 6e4, onProgress } = options;
  const firecrawl = createPollClient();
  return pollJobStatus("Batch scrape job", () => firecrawl.getBatchScrapeStatus(jobId), { pollInterval, timeout, onProgress });
}
// Polls an extract job until completion. Extraction is slower, so the default
// timeout is 120s instead of 60s (interval stays 2s).
async function pollExtractJob(jobId, options = {}) {
  const { pollInterval = 2e3, timeout = 12e4, onProgress } = options;
  const firecrawl = createPollClient();
  return pollJobStatus("Extract job", () => firecrawl.getExtractStatus(jobId), { pollInterval, timeout, onProgress });
}
|
|
405
|
+
|
|
406
|
+
// src/tools/job-management.ts
|
|
407
|
+
// Zod input schema for pollTool: identifies the async job and polling cadence.
var pollResultsSchema = z2.object({
  jobId: z2.string().describe("The job ID returned from crawl, batchScrape, or extract operations"),
  jobType: z2.enum(["crawl", "batchScrape", "extract"]).describe("Type of job to poll for"),
  // NOTE: expressed in seconds here; the poll helpers take milliseconds, so
  // pollTool multiplies by 1000 before delegating.
  pollInterval: z2.number().optional().describe("How often to check status in seconds (default: 2)"),
  maxWaitTime: z2.number().optional().describe("Maximum time to wait in seconds (default: 60)")
});
|
|
413
|
+
var pollTool = tool7({
  description: `Poll for results from async Firecrawl jobs (crawl, batchScrape, or extract).

Use this tool after starting a crawl, batchScrape, or extract job to wait for completion and get results.

Best for: Getting results from async operations you've already started.

Example workflow:
1. Use crawlTool to start a crawl \u2192 get job ID
2. Use pollTool with that job ID \u2192 get completed results`,
  inputSchema: pollResultsSchema,
  // Dispatches to the matching poll helper and normalizes the outcome into
  // a { success, status, ... } result object; errors (failure, cancellation,
  // timeout) are reported rather than thrown so the model can react.
  execute: async function(args) {
    // Schema values are in seconds; the helpers take milliseconds.
    const options = {
      pollInterval: (args.pollInterval || 2) * 1e3,
      timeout: (args.maxWaitTime || 60) * 1e3
    };
    try {
      // Fix: the original dispatched via an if/else chain that could leave
      // `result` undefined before reading `result.data`; each branch now
      // produces a defined result, and the duplicated extract return is gone.
      if (args.jobType === "extract") {
        const result = await pollExtractJob(args.jobId, options);
        return {
          success: true,
          status: "completed",
          data: result.data
        };
      }
      const result = args.jobType === "crawl"
        ? await pollCrawlJob(args.jobId, options)
        : await pollBatchScrapeJob(args.jobId, options);
      return {
        success: true,
        status: "completed",
        data: result.data,
        total: result.total,
        creditsUsed: result.creditsUsed
      };
    } catch (error) {
      return {
        success: false,
        status: "failed",
        error: error.message
      };
    }
  }
});
|
|
461
|
+
var statusTool = tool7({
  description: `Check the status of a Firecrawl job (crawl, batch, or extract).

Use this tool to check the current progress of any async job.
Returns the status and partial/complete results if available.

Possible statuses: pending, scraping, completed, failed`,
  inputSchema: checkStatusSchema,
  // Looks up the status endpoint matching the job type and returns its
  // response unchanged. Requires FIRECRAWL_API_KEY in the environment.
  execute: async (args) => {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl8({ apiKey });
    // jobType is schema-validated; default to "crawl" when omitted.
    switch (args.jobType || "crawl") {
      case "batchScrape":
        return firecrawl.getBatchScrapeStatus(args.id);
      case "extract":
        return firecrawl.getExtractStatus(args.id);
      default:
        return firecrawl.getCrawlStatus(args.id);
    }
  }
});
|
|
487
|
+
// Zod input schema for cancelTool. Only crawl and batch scrape jobs can be
// cancelled; extract jobs are deliberately absent from the enum.
var cancelJobSchema = z2.object({
  jobId: z2.string().describe("The job ID to cancel"),
  jobType: z2.enum(["crawl", "batchScrape"]).describe("The type of job to cancel")
});
|
|
491
|
+
var cancelTool = tool7({
  description: `Cancel a running Firecrawl job (crawl or batch scrape).

Use this tool to stop a long-running operation that is no longer needed.
Note: Extraction jobs cannot be cancelled via this API.

Returns the cancellation status.`,
  inputSchema: cancelJobSchema,
  // Dispatches to the matching SDK cancel call and reports the outcome.
  // Requires FIRECRAWL_API_KEY in the environment.
  execute: async function(args) {
    const apiKey = process.env.FIRECRAWL_API_KEY;
    if (!apiKey) {
      throw new Error("FIRECRAWL_API_KEY environment variable is required");
    }
    const firecrawl = new Firecrawl8({ apiKey });
    // jobType is schema-validated to exactly "crawl" or "batchScrape".
    const cancelled = args.jobType === "crawl"
      ? await firecrawl.cancelCrawl(args.jobId)
      : await firecrawl.cancelBatchScrape(args.jobId);
    return {
      success: cancelled,
      jobId: args.jobId,
      jobType: args.jobType,
      message: cancelled ? `${args.jobType} cancelled successfully` : `Failed to cancel ${args.jobType}`
    };
  }
});
|
|
519
|
+
// Public API: the nine AI SDK tools, their zod input schemas, and the
// standalone poll helpers for driving async jobs directly.
export {
  batchScrapeSchema,
  batchScrapeTool,
  cancelTool,
  checkStatusSchema,
  crawlSchema,
  crawlTool,
  extractSchema,
  extractTool,
  mapSchema,
  mapTool,
  pollBatchScrapeJob,
  pollCrawlJob,
  pollExtractJob,
  pollTool,
  scrapeSchema,
  scrapeTool,
  searchSchema,
  searchTool,
  statusTool
};
|
package/package.json
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "firecrawl-aisdk",
|
|
3
|
+
"version": "0.7.1",
|
|
4
|
+
"description": "Firecrawl tools for Vercel AI SDK - Web scraping and search capabilities for AI applications",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/index.js",
|
|
7
|
+
"module": "./dist/index.js",
|
|
8
|
+
"types": "./dist/index.d.ts",
|
|
9
|
+
"exports": {
|
|
10
|
+
".": {
|
|
11
|
+
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
|
|
13
|
+
}
|
|
14
|
+
},
|
|
15
|
+
"files": [
|
|
16
|
+
"dist"
|
|
17
|
+
],
|
|
18
|
+
"keywords": [
|
|
19
|
+
"ai",
|
|
20
|
+
"ai-sdk",
|
|
21
|
+
"vercel",
|
|
22
|
+
"firecrawl",
|
|
23
|
+
"web-scraping",
|
|
24
|
+
"web-search",
|
|
25
|
+
"tools",
|
|
26
|
+
"llm"
|
|
27
|
+
],
|
|
28
|
+
"author": "firecrawl",
|
|
29
|
+
"license": "MIT",
|
|
30
|
+
"engines": {
|
|
31
|
+
"node": ">=18.0.0"
|
|
32
|
+
},
|
|
33
|
+
"dependencies": {
|
|
34
|
+
"@mendable/firecrawl-js": "^4.6.1",
|
|
35
|
+
"ai": "^5.0.0",
|
|
36
|
+
"zod": "^3.0.0"
|
|
37
|
+
},
|
|
38
|
+
"devDependencies": {
|
|
39
|
+
"@ai-sdk/openai": "^2.0.69",
|
|
40
|
+
"@types/node": "^20.0.0",
|
|
41
|
+
"@vitest/ui": "^2.1.8",
|
|
42
|
+
"dotenv": "^17.2.3",
|
|
43
|
+
"tsup": "^8.5.0",
|
|
44
|
+
"tsx": "^4.20.6",
|
|
45
|
+
"typescript": "^5.9.3",
|
|
46
|
+
"vitest": "^2.1.8"
|
|
47
|
+
},
|
|
48
|
+
"scripts": {
|
|
49
|
+
"build": "tsup src/index.ts --format esm --dts",
|
|
50
|
+
"test": "vitest run",
|
|
51
|
+
"test:watch": "vitest"
|
|
52
|
+
}
|
|
53
|
+
}
|