firecrawl 1.29.3 → 3.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47)
  1. package/.env.example +4 -2
  2. package/LICENSE +0 -0
  3. package/README.md +85 -78
  4. package/audit-ci.jsonc +4 -0
  5. package/dist/chunk-JFWW4BWA.js +85 -0
  6. package/dist/index.cjs +964 -39
  7. package/dist/index.d.cts +529 -11
  8. package/dist/index.d.ts +529 -11
  9. package/dist/index.js +952 -27
  10. package/dist/package-KYZ3HXR5.js +4 -0
  11. package/dump.rdb +0 -0
  12. package/jest.config.js +0 -0
  13. package/package.json +6 -6
  14. package/src/__tests__/e2e/v2/batch.test.ts +74 -0
  15. package/src/__tests__/e2e/v2/crawl.test.ts +182 -0
  16. package/src/__tests__/e2e/v2/extract.test.ts +70 -0
  17. package/src/__tests__/e2e/v2/map.test.ts +55 -0
  18. package/src/__tests__/e2e/v2/scrape.test.ts +130 -0
  19. package/src/__tests__/e2e/v2/search.test.ts +247 -0
  20. package/src/__tests__/e2e/v2/usage.test.ts +36 -0
  21. package/src/__tests__/e2e/v2/utils/idmux.ts +58 -0
  22. package/src/__tests__/e2e/v2/watcher.test.ts +96 -0
  23. package/src/__tests__/unit/v2/errorHandler.test.ts +19 -0
  24. package/src/__tests__/unit/v2/scrape.unit.test.ts +11 -0
  25. package/src/__tests__/unit/v2/validation.test.ts +59 -0
  26. package/src/index.backup.ts +2146 -0
  27. package/src/index.ts +27 -2134
  28. package/src/v1/index.ts +2158 -0
  29. package/src/v2/client.ts +281 -0
  30. package/src/v2/methods/batch.ts +131 -0
  31. package/src/v2/methods/crawl.ts +160 -0
  32. package/src/v2/methods/extract.ts +86 -0
  33. package/src/v2/methods/map.ts +37 -0
  34. package/src/v2/methods/scrape.ts +26 -0
  35. package/src/v2/methods/search.ts +69 -0
  36. package/src/v2/methods/usage.ts +39 -0
  37. package/src/v2/types.ts +308 -0
  38. package/src/v2/utils/errorHandler.ts +18 -0
  39. package/src/v2/utils/getVersion.ts +14 -0
  40. package/src/v2/utils/httpClient.ts +99 -0
  41. package/src/v2/utils/validation.ts +50 -0
  42. package/src/v2/watcher.ts +159 -0
  43. package/tsconfig.json +2 -1
  44. package/tsup.config.ts +0 -0
  45. package/dist/package-Z6F7JDXI.js +0 -111
  46. /package/src/__tests__/{v1/e2e_withAuth → e2e/v1}/index.test.ts +0 -0
  47. /package/src/__tests__/{v1/unit → unit/v1}/monitor-job-status-retry.test.ts +0 -0
package/src/v2/client.ts
@@ -0,0 +1,281 @@
+ import { HttpClient } from "./utils/httpClient";
+ import { scrape } from "./methods/scrape";
+ import { search } from "./methods/search";
+ import { map as mapMethod } from "./methods/map";
+ import {
+   startCrawl,
+   getCrawlStatus,
+   cancelCrawl,
+   crawl as crawlWaiter,
+   getCrawlErrors,
+   getActiveCrawls,
+   crawlParamsPreview,
+ } from "./methods/crawl";
+ import {
+   startBatchScrape,
+   getBatchScrapeStatus,
+   getBatchScrapeErrors,
+   cancelBatchScrape,
+   batchScrape as batchWaiter,
+ } from "./methods/batch";
+ import { startExtract, getExtractStatus, extract as extractWaiter } from "./methods/extract";
+ import { getConcurrency, getCreditUsage, getTokenUsage } from "./methods/usage";
+ import type {
+   Document,
+   ScrapeOptions,
+   SearchData,
+   SearchRequest,
+   MapData,
+   MapOptions,
+   CrawlResponse,
+   CrawlJob,
+   CrawlErrorsResponse,
+   ActiveCrawlsResponse,
+   BatchScrapeResponse,
+   BatchScrapeJob,
+   ExtractResponse,
+ } from "./types";
+ import { Watcher } from "./watcher";
+ import type { WatcherOptions } from "./watcher";
+ import type { ZodTypeAny, infer as ZodInfer } from "zod";
+
+ // Helper types to infer the `json` field from a Zod schema included in `formats`
+ type ExtractJsonSchemaFromFormats<Formats> = Formats extends readonly any[]
+   ? Extract<Formats[number], { type: "json"; schema?: unknown }>["schema"]
+   : never;
+
+ type InferredJsonFromOptions<Opts> = Opts extends { formats?: infer Fmts }
+   ? ExtractJsonSchemaFromFormats<Fmts> extends ZodTypeAny
+     ? ZodInfer<ExtractJsonSchemaFromFormats<Fmts>>
+     : unknown
+   : unknown;
+
+ /**
+  * Configuration for the v2 client transport.
+  */
+ export interface FirecrawlClientOptions {
+   /** API key (falls back to FIRECRAWL_API_KEY). */
+   apiKey?: string | null;
+   /** API base URL (falls back to FIRECRAWL_API_URL or https://api.firecrawl.dev). */
+   apiUrl?: string | null;
+   /** Per-request timeout in milliseconds (optional). */
+   timeoutMs?: number;
+   /** Max automatic retries for transient failures (optional). */
+   maxRetries?: number;
+   /** Exponential backoff factor for retries (optional). */
+   backoffFactor?: number;
+ }
+
+ /**
+  * Firecrawl v2 client. Provides typed access to all v2 endpoints and utilities.
+  */
+ export class FirecrawlClient {
+   private readonly http: HttpClient;
+
+   /**
+    * Create a v2 client.
+    * @param options Transport configuration (API key, base URL, timeouts, retries).
+    */
+   constructor(options: FirecrawlClientOptions = {}) {
+     const apiKey = options.apiKey ?? process.env.FIRECRAWL_API_KEY ?? "";
+     const apiUrl = (options.apiUrl ?? process.env.FIRECRAWL_API_URL ?? "https://api.firecrawl.dev").replace(/\/$/, "");
+     if (!apiKey) {
+       throw new Error("API key is required. Set FIRECRAWL_API_KEY env or pass apiKey.");
+     }
+     this.http = new HttpClient({
+       apiKey,
+       apiUrl,
+       timeoutMs: options.timeoutMs,
+       maxRetries: options.maxRetries,
+       backoffFactor: options.backoffFactor,
+     });
+   }
+
+   // Scrape
+   /**
+    * Scrape a single URL.
+    * @param url Target URL.
+    * @param options Optional scrape options (formats, headers, etc.).
+    * @returns Resolved document with requested formats.
+    */
+   async scrape<Opts extends ScrapeOptions>(
+     url: string,
+     options: Opts
+   ): Promise<Omit<Document, "json"> & { json?: InferredJsonFromOptions<Opts> }>;
+   async scrape(url: string, options?: ScrapeOptions): Promise<Document>;
+   async scrape(url: string, options?: ScrapeOptions): Promise<Document> {
+     return scrape(this.http, url, options);
+   }
+
+   // Search
+   /**
+    * Search the web and optionally scrape each result.
+    * @param query Search query string.
+    * @param req Additional search options (sources, limit, scrapeOptions, etc.).
+    * @returns Structured search results.
+    */
+   async search(query: string, req: Omit<SearchRequest, "query"> = {}): Promise<SearchData> {
+     return search(this.http, { query, ...req });
+   }
+
+   // Map
+   /**
+    * Map a site to discover URLs (sitemap-aware).
+    * @param url Root URL to map.
+    * @param options Mapping options (sitemap mode, includeSubdomains, limit, timeout).
+    * @returns Discovered links.
+    */
+   async map(url: string, options?: MapOptions): Promise<MapData> {
+     return mapMethod(this.http, url, options);
+   }
+
+   // Crawl
+   /**
+    * Start a crawl job (async).
+    * @param url Root URL to crawl.
+    * @param req Crawl configuration (paths, limits, scrapeOptions, webhook, etc.).
+    * @returns Job id and url.
+    */
+   async startCrawl(url: string, req: Omit<Parameters<typeof startCrawl>[1], "url"> = {}): Promise<CrawlResponse> {
+     return startCrawl(this.http, { url, ...(req as any) });
+   }
+   /**
+    * Get the status and partial data of a crawl job.
+    * @param jobId Crawl job id.
+    */
+   async getCrawlStatus(jobId: string): Promise<CrawlJob> {
+     return getCrawlStatus(this.http, jobId);
+   }
+   /**
+    * Cancel a crawl job.
+    * @param jobId Crawl job id.
+    * @returns True if cancelled.
+    */
+   async cancelCrawl(jobId: string): Promise<boolean> {
+     return cancelCrawl(this.http, jobId);
+   }
+   /**
+    * Convenience waiter: start a crawl and poll until it finishes.
+    * @param url Root URL to crawl.
+    * @param req Crawl configuration plus waiter controls (pollInterval, timeout seconds).
+    * @returns Final job snapshot.
+    */
+   async crawl(url: string, req: Omit<Parameters<typeof startCrawl>[1], "url"> & { pollInterval?: number; timeout?: number } = {}): Promise<CrawlJob> {
+     return crawlWaiter(this.http, { url, ...(req as any) }, req.pollInterval, req.timeout);
+   }
+   /**
+    * Retrieve crawl errors and robots.txt blocks.
+    * @param crawlId Crawl job id.
+    */
+   async getCrawlErrors(crawlId: string): Promise<CrawlErrorsResponse> {
+     return getCrawlErrors(this.http, crawlId);
+   }
+   /**
+    * List active crawls for the authenticated team.
+    */
+   async getActiveCrawls(): Promise<ActiveCrawlsResponse> {
+     return getActiveCrawls(this.http);
+   }
+   /**
+    * Preview normalized crawl parameters produced by a natural-language prompt.
+    * @param url Root URL.
+    * @param prompt Natural-language instruction.
+    */
+   async crawlParamsPreview(url: string, prompt: string): Promise<Record<string, unknown>> {
+     return crawlParamsPreview(this.http, url, prompt);
+   }
+
+   // Batch
+   /**
+    * Start a batch scrape job for multiple URLs (async).
+    * @param urls URLs to scrape.
+    * @param opts Batch options (scrape options, webhook, concurrency, idempotency key, etc.).
+    * @returns Job id and url.
+    */
+   async startBatchScrape(urls: string[], opts?: Parameters<typeof startBatchScrape>[2]): Promise<BatchScrapeResponse> {
+     return startBatchScrape(this.http, urls, opts);
+   }
+   /**
+    * Get the status and partial data of a batch scrape job.
+    * @param jobId Batch job id.
+    */
+   async getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob> {
+     return getBatchScrapeStatus(this.http, jobId);
+   }
+   /**
+    * Retrieve batch scrape errors and robots.txt blocks.
+    * @param jobId Batch job id.
+    */
+   async getBatchScrapeErrors(jobId: string): Promise<CrawlErrorsResponse> {
+     return getBatchScrapeErrors(this.http, jobId);
+   }
+   /**
+    * Cancel a batch scrape job.
+    * @param jobId Batch job id.
+    * @returns True if cancelled.
+    */
+   async cancelBatchScrape(jobId: string): Promise<boolean> {
+     return cancelBatchScrape(this.http, jobId);
+   }
+   /**
+    * Convenience waiter: start a batch scrape and poll until it finishes.
+    * @param urls URLs to scrape.
+    * @param opts Batch options plus waiter controls (pollInterval, timeout seconds).
+    * @returns Final job snapshot.
+    */
+   async batchScrape(urls: string[], opts?: Parameters<typeof startBatchScrape>[2] & { pollInterval?: number; timeout?: number }): Promise<BatchScrapeJob> {
+     return batchWaiter(this.http, urls, opts);
+   }
+
+   // Extract
+   /**
+    * Start an extract job (async).
+    * @param args Extraction request (urls, schema or prompt, flags).
+    * @returns Job id or processing state.
+    */
+   async startExtract(args: Parameters<typeof startExtract>[1]): Promise<ExtractResponse> {
+     return startExtract(this.http, args);
+   }
+   /**
+    * Get extract job status/data.
+    * @param jobId Extract job id.
+    */
+   async getExtractStatus(jobId: string): Promise<ExtractResponse> {
+     return getExtractStatus(this.http, jobId);
+   }
+   /**
+    * Convenience waiter: start an extract and poll until it finishes.
+    * @param args Extraction request plus waiter controls (pollInterval, timeout seconds).
+    * @returns Final extract response.
+    */
+   async extract(args: Parameters<typeof startExtract>[1] & { pollInterval?: number; timeout?: number }): Promise<ExtractResponse> {
+     return extractWaiter(this.http, args);
+   }
+
+   // Usage
+   /** Current concurrency usage. */
+   async getConcurrency() {
+     return getConcurrency(this.http);
+   }
+   /** Current credit usage. */
+   async getCreditUsage() {
+     return getCreditUsage(this.http);
+   }
+   /** Recent token usage. */
+   async getTokenUsage() {
+     return getTokenUsage(this.http);
+   }
+
+   // Watcher
+   /**
+    * Create a watcher for a crawl or batch job. Emits: `document`, `snapshot`, `done`, `error`.
+    * @param jobId Job id.
+    * @param opts Watcher options (kind, pollInterval, timeout seconds).
+    */
+   watcher(jobId: string, opts: WatcherOptions = {}): Watcher {
+     return new Watcher(this.http, jobId, opts);
+   }
+ }
+
+ export default FirecrawlClient;
+
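For reference, a minimal usage sketch of the new v2 client. It assumes the package root re-exports FirecrawlClient as its default export (imported as `Firecrawl` below) and that `formats` accepts `{ type: "json", schema }` entries, as the helper types in this file suggest; neither detail is confirmed by the diff itself, and the URLs are placeholders.

// Usage sketch (not part of the diff). Run as an ES module with FIRECRAWL_API_KEY set.
import { z } from "zod";
import Firecrawl from "firecrawl"; // assumed: the package root re-exports FirecrawlClient from src/v2/client.ts

const client = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });

// A Zod schema inside `formats` types the returned `json` field via InferredJsonFromOptions.
const doc = await client.scrape("https://example.com", {
  formats: ["markdown", { type: "json", schema: z.object({ title: z.string() }) }],
});
console.log(doc.json?.title);

// Convenience waiter: starts the crawl and polls until a terminal status (timeout is in seconds).
const job = await client.crawl("https://example.com", { limit: 10, pollInterval: 2, timeout: 120 });
console.log(job.status, `${job.completed}/${job.total}`, job.data.length, "documents");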
package/src/v2/methods/batch.ts
@@ -0,0 +1,131 @@
+ import {
+   type BatchScrapeJob,
+   type BatchScrapeResponse,
+   type CrawlErrorsResponse,
+   type Document,
+   type ScrapeOptions,
+   type WebhookConfig,
+ } from "../types";
+ import { HttpClient } from "../utils/httpClient";
+ import { ensureValidScrapeOptions } from "../utils/validation";
+ import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+
+ export interface StartBatchOptions {
+   options?: ScrapeOptions;
+   webhook?: string | WebhookConfig;
+   appendToId?: string;
+   ignoreInvalidURLs?: boolean;
+   maxConcurrency?: number;
+   zeroDataRetention?: boolean;
+   integration?: string;
+   idempotencyKey?: string;
+ }
+
+ export async function startBatchScrape(
+   http: HttpClient,
+   urls: string[],
+   {
+     options,
+     webhook,
+     appendToId,
+     ignoreInvalidURLs,
+     maxConcurrency,
+     zeroDataRetention,
+     integration,
+     idempotencyKey,
+   }: StartBatchOptions = {}
+ ): Promise<BatchScrapeResponse> {
+   if (!Array.isArray(urls) || urls.length === 0) throw new Error("URLs list cannot be empty");
+   const payload: Record<string, unknown> = { urls };
+   if (options) {
+     ensureValidScrapeOptions(options);
+     Object.assign(payload, options);
+   }
+   if (webhook != null) payload.webhook = webhook;
+   if (appendToId != null) payload.appendToId = appendToId;
+   if (ignoreInvalidURLs != null) payload.ignoreInvalidURLs = ignoreInvalidURLs;
+   if (maxConcurrency != null) payload.maxConcurrency = maxConcurrency;
+   if (zeroDataRetention != null) payload.zeroDataRetention = zeroDataRetention;
+   if (integration != null) payload.integration = integration;
+
+   try {
+     const headers = http.prepareHeaders(idempotencyKey);
+     const res = await http.post<{ success: boolean; id: string; url: string; invalidURLs?: string[]; error?: string }>("/v2/batch/scrape", payload, headers);
+     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "start batch scrape");
+     return { id: res.data.id, url: res.data.url, invalidURLs: res.data.invalidURLs || undefined };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "start batch scrape");
+     throw err;
+   }
+ }
+
+ export async function getBatchScrapeStatus(http: HttpClient, jobId: string): Promise<BatchScrapeJob> {
+   try {
+     const res = await http.get<{ success: boolean; status: BatchScrapeJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/batch/scrape/${jobId}`);
+     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
+     const body = res.data;
+     return {
+       status: body.status,
+       completed: body.completed ?? 0,
+       total: body.total ?? 0,
+       creditsUsed: body.creditsUsed,
+       expiresAt: body.expiresAt,
+       next: body.next ?? null,
+       data: (body.data || []) as Document[],
+     };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
+     throw err;
+   }
+ }
+
+ export async function cancelBatchScrape(http: HttpClient, jobId: string): Promise<boolean> {
+   try {
+     const res = await http.delete<{ status: string }>(`/v2/batch/scrape/${jobId}`);
+     if (res.status !== 200) throwForBadResponse(res, "cancel batch scrape");
+     return res.data?.status === "cancelled";
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "cancel batch scrape");
+     throw err;
+   }
+ }
+
+ export async function getBatchScrapeErrors(http: HttpClient, jobId: string): Promise<CrawlErrorsResponse> {
+   try {
+     const res = await http.get<{ success?: boolean; data?: { errors: Array<Record<string, string>>; robotsBlocked: string[] } }>(`/v2/batch/scrape/${jobId}/errors`);
+     if (res.status !== 200) throwForBadResponse(res, "get batch scrape errors");
+     const payload = res.data?.data ?? (res.data as any);
+     return { errors: payload.errors || [], robotsBlocked: payload.robotsBlocked || [] };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape errors");
+     throw err;
+   }
+ }
+
+ export async function waitForBatchCompletion(http: HttpClient, jobId: string, pollInterval = 2, timeout?: number): Promise<BatchScrapeJob> {
+   const start = Date.now();
+   while (true) {
+     const status = await getBatchScrapeStatus(http, jobId);
+     if (["completed", "failed", "cancelled"].includes(status.status)) return status;
+     if (timeout != null && Date.now() - start > timeout * 1000) {
+       throw new Error(`Batch scrape job ${jobId} did not complete within ${timeout} seconds`);
+     }
+     await new Promise((r) => setTimeout(r, Math.max(1000, pollInterval * 1000)));
+   }
+ }
+
+ export async function batchScrape(
+   http: HttpClient,
+   urls: string[],
+   opts: StartBatchOptions & { pollInterval?: number; timeout?: number } = {}
+ ): Promise<BatchScrapeJob> {
+   const start = await startBatchScrape(http, urls, opts);
+   return waitForBatchCompletion(http, start.id, opts.pollInterval ?? 2, opts.timeout);
+ }
+
+ export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
+   const chunks: string[][] = [];
+   for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
+   return chunks;
+ }
+
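These helpers take the shared HttpClient as their first argument; the FirecrawlClient methods above simply forward to them. A sketch of direct use, assuming HttpClient accepts the same option names the client constructor passes it, that ScrapeOptions has a `formats` field, and with placeholder URLs and illustrative relative import paths:

// Sketch only: batch helpers are normally reached via FirecrawlClient.batchScrape.
import { HttpClient } from "../utils/httpClient";
import { batchScrape, chunkUrls } from "./batch";

const http = new HttpClient({ apiKey: process.env.FIRECRAWL_API_KEY ?? "", apiUrl: "https://api.firecrawl.dev" });

// chunkUrls splits large URL lists; each chunk becomes its own batch job.
for (const chunk of chunkUrls(["https://a.example", "https://b.example"], 100)) {
  const job = await batchScrape(http, chunk, {
    options: { formats: ["markdown"] }, // assumed ScrapeOptions shape
    maxConcurrency: 5,
    pollInterval: 2,
    timeout: 300, // seconds; the waiter throws if the job is still running after this
  });
  console.log(job.status, `${job.completed}/${job.total}`, job.data.length, "documents");
}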
package/src/v2/methods/crawl.ts
@@ -0,0 +1,160 @@
+ import {
+   type ActiveCrawlsResponse,
+   type CrawlErrorsResponse,
+   type CrawlJob,
+   type CrawlResponse,
+   type Document,
+   type ScrapeOptions,
+   type WebhookConfig,
+ } from "../types";
+ import { HttpClient } from "../utils/httpClient";
+ import { ensureValidScrapeOptions } from "../utils/validation";
+ import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+
+ export interface CrawlRequest {
+   url: string;
+   prompt?: string | null;
+   excludePaths?: string[] | null;
+   includePaths?: string[] | null;
+   maxDiscoveryDepth?: number | null;
+   sitemap?: "skip" | "include";
+   ignoreQueryParameters?: boolean;
+   limit?: number | null;
+   crawlEntireDomain?: boolean;
+   allowExternalLinks?: boolean;
+   allowSubdomains?: boolean;
+   delay?: number | null;
+   maxConcurrency?: number | null;
+   webhook?: string | WebhookConfig | null;
+   scrapeOptions?: ScrapeOptions | null;
+   zeroDataRetention?: boolean;
+ }
+
+ function prepareCrawlPayload(request: CrawlRequest): Record<string, unknown> {
+   if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
+   const data: Record<string, unknown> = { url: request.url.trim() };
+   if (request.prompt) data.prompt = request.prompt;
+   if (request.excludePaths) data.excludePaths = request.excludePaths;
+   if (request.includePaths) data.includePaths = request.includePaths;
+   if (request.maxDiscoveryDepth != null) data.maxDiscoveryDepth = request.maxDiscoveryDepth;
+   if (request.sitemap != null) data.sitemap = request.sitemap;
+   if (request.ignoreQueryParameters != null) data.ignoreQueryParameters = request.ignoreQueryParameters;
+   if (request.limit != null) data.limit = request.limit;
+   if (request.crawlEntireDomain != null) data.crawlEntireDomain = request.crawlEntireDomain;
+   if (request.allowExternalLinks != null) data.allowExternalLinks = request.allowExternalLinks;
+   if (request.allowSubdomains != null) data.allowSubdomains = request.allowSubdomains;
+   if (request.delay != null) data.delay = request.delay;
+   if (request.maxConcurrency != null) data.maxConcurrency = request.maxConcurrency;
+   if (request.webhook != null) data.webhook = request.webhook;
+   if (request.scrapeOptions) {
+     ensureValidScrapeOptions(request.scrapeOptions);
+     data.scrapeOptions = request.scrapeOptions;
+   }
+   if (request.zeroDataRetention != null) data.zeroDataRetention = request.zeroDataRetention;
+   return data;
+ }
+
+ export async function startCrawl(http: HttpClient, request: CrawlRequest): Promise<CrawlResponse> {
+   const payload = prepareCrawlPayload(request);
+   try {
+     const res = await http.post<{ success: boolean; id: string; url: string; error?: string }>("/v2/crawl", payload);
+     if (res.status !== 200 || !res.data?.success) {
+       throwForBadResponse(res, "start crawl");
+     }
+     return { id: res.data.id, url: res.data.url };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "start crawl");
+     throw err;
+   }
+ }
+
+ export async function getCrawlStatus(http: HttpClient, jobId: string): Promise<CrawlJob> {
+   try {
+     const res = await http.get<{ success: boolean; status: CrawlJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/crawl/${jobId}`);
+     if (res.status !== 200 || !res.data?.success) {
+       throwForBadResponse(res, "get crawl status");
+     }
+     const body = res.data;
+     return {
+       status: body.status,
+       completed: body.completed ?? 0,
+       total: body.total ?? 0,
+       creditsUsed: body.creditsUsed,
+       expiresAt: body.expiresAt,
+       next: body.next ?? null,
+       data: (body.data || []) as Document[],
+     };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
+     throw err;
+   }
+ }
+
+ export async function cancelCrawl(http: HttpClient, jobId: string): Promise<boolean> {
+   try {
+     const res = await http.delete<{ status: string }>(`/v2/crawl/${jobId}`);
+     if (res.status !== 200) throwForBadResponse(res, "cancel crawl");
+     return res.data?.status === "cancelled";
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "cancel crawl");
+     throw err;
+   }
+ }
+
+ export async function waitForCrawlCompletion(http: HttpClient, jobId: string, pollInterval = 2, timeout?: number): Promise<CrawlJob> {
+   const start = Date.now();
+   while (true) {
+     const status = await getCrawlStatus(http, jobId);
+     if (["completed", "failed", "cancelled"].includes(status.status)) return status;
+     if (timeout != null && Date.now() - start > timeout * 1000) {
+       throw new Error(`Crawl job ${jobId} did not complete within ${timeout} seconds`);
+     }
+     await new Promise((r) => setTimeout(r, Math.max(1000, pollInterval * 1000)));
+   }
+ }
+
+ export async function crawl(http: HttpClient, request: CrawlRequest, pollInterval = 2, timeout?: number): Promise<CrawlJob> {
+   const started = await startCrawl(http, request);
+   return waitForCrawlCompletion(http, started.id, pollInterval, timeout);
+ }
+
+ export async function getCrawlErrors(http: HttpClient, crawlId: string): Promise<CrawlErrorsResponse> {
+   try {
+     const res = await http.get<{ success?: boolean; data?: { errors: Array<Record<string, string>>; robotsBlocked: string[] } }>(`/v2/crawl/${crawlId}/errors`);
+     if (res.status !== 200) throwForBadResponse(res, "get crawl errors");
+     const payload = res.data?.data ?? (res.data as any);
+     return { errors: payload.errors || [], robotsBlocked: payload.robotsBlocked || [] };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl errors");
+     throw err;
+   }
+ }
+
+ export async function getActiveCrawls(http: HttpClient): Promise<ActiveCrawlsResponse> {
+   try {
+     const res = await http.get<{ success: boolean; crawls: Array<{ id: string; teamId?: string; team_id?: string; url: string; options?: any }> }>(`/v2/crawl/active`);
+     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get active crawls");
+     const crawlsIn = res.data?.crawls || [];
+     const crawls = crawlsIn.map((c) => ({ id: c.id, teamId: (c as any).teamId ?? (c as any).team_id, url: c.url, options: c.options ?? null }));
+     return { success: true, crawls };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "get active crawls");
+     throw err;
+   }
+ }
+
+ export async function crawlParamsPreview(http: HttpClient, url: string, prompt: string): Promise<Record<string, unknown>> {
+   if (!url || !url.trim()) throw new Error("URL cannot be empty");
+   if (!prompt || !prompt.trim()) throw new Error("Prompt cannot be empty");
+   try {
+     const res = await http.post<{ success: boolean; data?: Record<string, unknown>; warning?: string }>("/v2/crawl/params-preview", { url: url.trim(), prompt });
+     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "crawl params preview");
+     const data = res.data.data || {};
+     if (res.data.warning) (data as any).warning = res.data.warning;
+     return data;
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
+     throw err;
+   }
+ }
+
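The crawl module follows the same start/status/cancel pattern, and crawlParamsPreview lets you inspect the options the API derives from a natural-language prompt before committing to a job. A sketch of the asynchronous flow, with placeholder URLs and the same assumed HttpClient options as above:

// Sketch only: this flow is also exposed as client.startCrawl / client.getCrawlStatus / client.getCrawlErrors.
import { HttpClient } from "../utils/httpClient";
import { startCrawl, getCrawlStatus, getCrawlErrors, crawlParamsPreview } from "./crawl";

const http = new HttpClient({ apiKey: process.env.FIRECRAWL_API_KEY ?? "", apiUrl: "https://api.firecrawl.dev" });

// Preview the crawl options the API would derive from a prompt.
const params = await crawlParamsPreview(http, "https://docs.example.com", "only the blog, at most 50 pages");
console.log(params);

// Start asynchronously, then poll; document data arrives incrementally while the job runs.
const { id } = await startCrawl(http, { url: "https://docs.example.com", limit: 50, sitemap: "include" });
const snapshot = await getCrawlStatus(http, id);
if (snapshot.status === "failed") {
  const { errors, robotsBlocked } = await getCrawlErrors(http, id);
  console.error(errors, robotsBlocked);
}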
package/src/v2/methods/extract.ts
@@ -0,0 +1,86 @@
+ import { type ExtractResponse, type ScrapeOptions } from "../types";
+ import { HttpClient } from "../utils/httpClient";
+ import { ensureValidScrapeOptions } from "../utils/validation";
+ import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+ import { zodToJsonSchema } from "zod-to-json-schema";
+ import type { ZodTypeAny } from "zod";
+
+ function prepareExtractPayload(args: {
+   urls?: string[];
+   prompt?: string;
+   schema?: Record<string, unknown> | ZodTypeAny;
+   systemPrompt?: string;
+   allowExternalLinks?: boolean;
+   enableWebSearch?: boolean;
+   showSources?: boolean;
+   scrapeOptions?: ScrapeOptions;
+   ignoreInvalidURLs?: boolean;
+ }): Record<string, unknown> {
+   const body: Record<string, unknown> = {};
+   if (args.urls) body.urls = args.urls;
+   if (args.prompt != null) body.prompt = args.prompt;
+   if (args.schema != null) {
+     const s: any = args.schema;
+     const isZod = s && (typeof s.safeParse === "function" || typeof s.parse === "function") && s._def;
+     body.schema = isZod ? zodToJsonSchema(s) : args.schema;
+   }
+   if (args.systemPrompt != null) body.systemPrompt = args.systemPrompt;
+   if (args.allowExternalLinks != null) body.allowExternalLinks = args.allowExternalLinks;
+   if (args.enableWebSearch != null) body.enableWebSearch = args.enableWebSearch;
+   if (args.showSources != null) body.showSources = args.showSources;
+   if (args.ignoreInvalidURLs != null) body.ignoreInvalidURLs = args.ignoreInvalidURLs;
+   if (args.scrapeOptions) {
+     ensureValidScrapeOptions(args.scrapeOptions);
+     body.scrapeOptions = args.scrapeOptions;
+   }
+   return body;
+ }
+
+ export async function startExtract(http: HttpClient, args: Parameters<typeof prepareExtractPayload>[0]): Promise<ExtractResponse> {
+   const payload = prepareExtractPayload(args);
+   try {
+     const res = await http.post<ExtractResponse>("/v2/extract", payload);
+     if (res.status !== 200) throwForBadResponse(res, "extract");
+     return res.data;
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "extract");
+     throw err;
+   }
+ }
+
+ export async function getExtractStatus(http: HttpClient, jobId: string): Promise<ExtractResponse> {
+   try {
+     const res = await http.get<ExtractResponse>(`/v2/extract/${jobId}`);
+     if (res.status !== 200) throwForBadResponse(res, "extract status");
+     return res.data;
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "extract status");
+     throw err;
+   }
+ }
+
+ export async function waitExtract(
+   http: HttpClient,
+   jobId: string,
+   pollInterval = 2,
+   timeout?: number
+ ): Promise<ExtractResponse> {
+   const start = Date.now();
+   while (true) {
+     const status = await getExtractStatus(http, jobId);
+     if (["completed", "failed", "cancelled"].includes(status.status || "")) return status;
+     if (timeout != null && Date.now() - start > timeout * 1000) return status;
+     await new Promise((r) => setTimeout(r, Math.max(1000, pollInterval * 1000)));
+   }
+ }
+
+ export async function extract(
+   http: HttpClient,
+   args: Parameters<typeof prepareExtractPayload>[0] & { pollInterval?: number; timeout?: number }
+ ): Promise<ExtractResponse> {
+   const started = await startExtract(http, args);
+   const jobId = started.id;
+   if (!jobId) return started;
+   return waitExtract(http, jobId, args.pollInterval ?? 2, args.timeout);
+ }
+
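prepareExtractPayload accepts either a plain JSON schema object or a Zod schema (detected via safeParse/parse and converted with zod-to-json-schema). A sketch of the waiter, noting that on timeout waitExtract returns the last snapshot rather than throwing; the URL, prompt, and schema below are placeholders:

// Sketch only: normally called as client.extract(...).
import { z } from "zod";
import { HttpClient } from "../utils/httpClient";
import { extract } from "./extract";

const http = new HttpClient({ apiKey: process.env.FIRECRAWL_API_KEY ?? "", apiUrl: "https://api.firecrawl.dev" });

const result = await extract(http, {
  urls: ["https://example.com/pricing"],
  prompt: "Extract the plan names and monthly prices",
  schema: z.object({ plans: z.array(z.object({ name: z.string(), priceUsd: z.number() })) }),
  pollInterval: 2,
  timeout: 120, // seconds; check result.status, since a timeout returns the in-flight snapshot
});
console.log(result.status);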
package/src/v2/methods/map.ts
@@ -0,0 +1,37 @@
+ import { type MapData, type MapOptions, type SearchResult } from "../types";
+ import { HttpClient } from "../utils/httpClient";
+ import { throwForBadResponse, normalizeAxiosError } from "../utils/errorHandler";
+
+ function prepareMapPayload(url: string, options?: MapOptions): Record<string, unknown> {
+   if (!url || !url.trim()) throw new Error("URL cannot be empty");
+   const payload: Record<string, unknown> = { url: url.trim() };
+   if (options) {
+     if (options.sitemap != null) payload.sitemap = options.sitemap;
+     if (options.search != null) payload.search = options.search;
+     if (options.includeSubdomains != null) payload.includeSubdomains = options.includeSubdomains;
+     if (options.limit != null) payload.limit = options.limit;
+     if (options.timeout != null) payload.timeout = options.timeout;
+   }
+   return payload;
+ }
+
+ export async function map(http: HttpClient, url: string, options?: MapOptions): Promise<MapData> {
+   const payload = prepareMapPayload(url, options);
+   try {
+     const res = await http.post<{ success: boolean; error?: string; links?: Array<string | SearchResult> }>("/v2/map", payload);
+     if (res.status !== 200 || !res.data?.success) {
+       throwForBadResponse(res, "map");
+     }
+     const linksIn = res.data.links || [];
+     const links: SearchResult[] = [];
+     for (const item of linksIn) {
+       if (typeof item === "string") links.push({ url: item });
+       else if (item && typeof item === "object") links.push({ url: item.url, title: (item as any).title, description: (item as any).description });
+     }
+     return { links };
+   } catch (err: any) {
+     if (err?.isAxiosError) return normalizeAxiosError(err, "map");
+     throw err;
+   }
+ }
+
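map normalizes the API's links, which may arrive as plain strings or objects, into SearchResult entries. A short sketch with a placeholder URL:

// Sketch only: equivalent to client.map(url, options).
import { HttpClient } from "../utils/httpClient";
import { map } from "./map";

const http = new HttpClient({ apiKey: process.env.FIRECRAWL_API_KEY ?? "", apiUrl: "https://api.firecrawl.dev" });

const { links } = await map(http, "https://example.com", { includeSubdomains: true, limit: 100 });
for (const link of links) console.log(link.url, link.title ?? "");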