firecrawl 1.10.0 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +52 -71
- package/dist/index.d.cts +4 -8
- package/dist/index.d.ts +4 -8
- package/dist/index.js +52 -71
- package/package.json +1 -1
- package/src/__tests__/index.test.ts +9 -18
- package/src/__tests__/v1/e2e_withAuth/index.test.ts +18 -34
- package/src/index.ts +53 -80
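The substantive change across these files is a revert of 1.10.0's self-hosted authentication relaxation: the private isCloudService check is gone, and the constructor once again demands an API key no matter which apiUrl is configured. A minimal sketch of the 1.10.1 contract (the import path and env var name are illustrative, not taken from this diff):

```ts
import FirecrawlApp from "firecrawl"; // illustrative import path

// The key is now checked unconditionally; a non-string apiKey throws
// FirecrawlError("No API key provided", 401), even for a self-hosted apiUrl.
const app = new FirecrawlApp({
  apiKey: process.env.FIRECRAWL_API_KEY ?? "", // placeholder env var
  apiUrl: "https://api.firecrawl.dev",         // optional; this is the default
});
```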
package/dist/index.cjs
CHANGED

@@ -49,20 +49,16 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
-  isCloudService(url) {
-    return url.includes("api.firecrawl.dev");
-  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    const baseUrl = apiUrl || "https://api.firecrawl.dev";
-    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
+    if (typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey || "";
-    this.apiUrl = baseUrl;
+    this.apiKey = apiKey;
+    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -202,7 +198,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -299,9 +295,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params, webhook, ignoreInvalidURLs };
+    let jsonData = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -337,9 +333,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {}, webhook, ignoreInvalidURLs };
+    let jsonData = { urls, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -367,8 +363,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -397,7 +393,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -533,40 +529,44 @@ var FirecrawlApp = class {
    * @returns The final job status or data.
    */
   async monitorJobStatus(id, headers, checkInterval) {
-    while (true) {
-      let statusResponse = await this.getRequest(
-        `${this.apiUrl}/v1/crawl/${id}`,
-        headers
-      );
-      if (statusResponse.status === 200) {
-        let statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            let data = statusData.data;
-            while ("next" in statusData) {
-              statusResponse = await this.getRequest(statusData.next, headers);
-              statusData = statusResponse.data;
-              data = data.concat(statusData.data);
-            }
-            statusData.data = data;
-            return statusData;
-          } else {
-            throw new FirecrawlError("Crawl job completed but no data was returned", 500);
-          }
-        } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
-          checkInterval = Math.max(checkInterval, 2);
-          await new Promise(
-            (resolve) => setTimeout(resolve, checkInterval * 1e3)
-          );
-        } else {
-          throw new FirecrawlError(
-            `Crawl job failed or was stopped. Status: ${statusData.status}`,
-            500
-          );
-        }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
-      }
-    }
+    try {
+      while (true) {
+        let statusResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === "object" && "next" in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
+            }
+          } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise(
+              (resolve) => setTimeout(resolve, checkInterval * 1e3)
+            );
+          } else {
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
+          }
+        } else {
+          this.handleError(statusResponse, "check crawl status");
+        }
+      }
+    } catch (error) {
+      throw new FirecrawlError(error, 500);
+    }
   }
   /**
@@ -593,10 +593,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
   ws;
   data;
   status;
-  id;
   constructor(id, app) {
     super();
-    this.id = id;
     this.ws = new import_isows.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -606,8 +604,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data,
-            id: this.id
+            data: this.data
           }
         }));
       } else if (msg.type === "error") {
@@ -616,8 +613,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error,
-            id: this.id
+            error: msg.error
           }
         }));
       } else if (msg.type === "catchup") {
@@ -625,18 +621,12 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: {
-              ...doc,
-              id: this.id
-            }
+            detail: doc
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: {
-            ...msg.data,
-            id: this.id
-          }
+          detail: msg.data
         }));
       }
     };
@@ -645,20 +635,12 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
       this.ws.close();
       return;
     }
-    try {
-      const msg = JSON.parse(ev.data);
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on message", error);
-    }
+    const msg = JSON.parse(ev.data);
+    messageHandler(msg);
   }).bind(this);
   this.ws.onclose = ((ev) => {
-    try {
-      const msg = JSON.parse(ev.reason);
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on close", error);
-    }
+    const msg = JSON.parse(ev.reason);
+    messageHandler(msg);
   }).bind(this);
   this.ws.onerror = ((_) => {
     this.status = "failed";
@@ -666,8 +648,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
       detail: {
         status: this.status,
         data: this.data,
-        error: "WebSocket error",
-        id: this.id
+        error: "WebSocket error"
       }
     }));
   }).bind(this);
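Because crawlUrl and batchScrapeUrls poll through monitorJobStatus, the try/catch added above changes what callers observe on failure: any error raised while polling (network faults included) is now rethrown as a FirecrawlError with code 500 instead of escaping as a raw transport error. A hedged usage sketch, assuming app is the instance from the earlier example and that CrawlParams accepts a limit:

```ts
async function waitForCrawl(app: FirecrawlApp): Promise<void> {
  try {
    // crawlUrl polls /v1/crawl/{id} via monitorJobStatus until completion.
    const status = await app.crawlUrl("https://example.com", { limit: 5 }); // placeholder URL/params
    console.log(status);
  } catch (err) {
    // Polling failures arrive here wrapped as FirecrawlError(error, 500).
    console.error(err);
  }
}
```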
package/dist/index.d.cts
CHANGED

@@ -171,7 +171,6 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
-    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -226,11 +225,10 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt?: string;
+    prompt: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
-    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -265,7 +263,6 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
-    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -332,8 +329,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -341,7 +338,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -417,7 +414,6 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
-    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
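Two declaration-level breaks are worth calling out here: ExtractParams now requires prompt and no longer accepts includeSubdomains, and BatchScrapeResponse has lost invalidURLs. A type-level sketch (the import specifier is hypothetical):

```ts
import type { ExtractParams } from "firecrawl"; // hypothetical specifier
import { z } from "zod";

const schema = z.object({ title: z.string() });

// `prompt` is mandatory in 1.10.1; `includeSubdomains` no longer type-checks.
const params: ExtractParams<typeof schema> = {
  prompt: "Extract the page title.",
  schema,
  allowExternalLinks: false,
};
```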
package/dist/index.d.ts
CHANGED

@@ -171,7 +171,6 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
-    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -226,11 +225,10 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt?: string;
+    prompt: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
-    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -265,7 +263,6 @@ declare class FirecrawlError extends Error {
 declare class FirecrawlApp {
     apiKey: string;
     apiUrl: string;
-    private isCloudService;
     /**
      * Initializes a new instance of the FirecrawlApp class.
      * @param config - Configuration options for the FirecrawlApp instance.
@@ -332,8 +329,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeStatusResponse | ErrorResponse>;
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<BatchScrapeResponse | ErrorResponse>;
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -341,7 +338,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string, webhook?: CrawlParams["webhook"], ignoreInvalidURLs?: boolean): Promise<CrawlWatcher>;
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -417,7 +414,6 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
-    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
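The restored signatures make the calling convention explicit: batchScrapeUrls still takes an optional webhook, while asyncBatchScrapeUrls and batchScrapeUrlsAndWatch drop their extra trailing arguments. A sketch under the same assumptions as the earlier examples:

```ts
async function scrapeBatch(app: FirecrawlApp): Promise<void> {
  const result = await app.batchScrapeUrls(
    ["https://example.com", "https://example.org"], // placeholder URLs
    { formats: ["markdown"] },                      // ScrapeParams
    2,                                              // pollInterval in seconds
  );
  if (result.success) {
    // BatchScrapeStatusResponse; note there is no invalidURLs field anymore.
    console.log(result.data.length, "documents");
  }
}
```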
package/dist/index.js
CHANGED

@@ -13,20 +13,16 @@ var FirecrawlError = class extends Error {
 var FirecrawlApp = class {
   apiKey;
   apiUrl;
-  isCloudService(url) {
-    return url.includes("api.firecrawl.dev");
-  }
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }) {
-    const baseUrl = apiUrl || "https://api.firecrawl.dev";
-    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
+    if (typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey || "";
-    this.apiUrl = baseUrl;
+    this.apiKey = apiKey;
+    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
   }
   /**
    * Scrapes a URL using the Firecrawl API.
@@ -166,7 +162,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -263,9 +259,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook, ignoreInvalidURLs) {
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params, webhook, ignoreInvalidURLs };
+    let jsonData = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -301,9 +297,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls, ...params ?? {}, webhook, ignoreInvalidURLs };
+    let jsonData = { urls, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -331,8 +327,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey, webhook, ignoreInvalidURLs) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -361,7 +357,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -497,40 +493,44 @@ var FirecrawlApp = class {
    * @returns The final job status or data.
    */
   async monitorJobStatus(id, headers, checkInterval) {
-    while (true) {
-      let statusResponse = await this.getRequest(
-        `${this.apiUrl}/v1/crawl/${id}`,
-        headers
-      );
-      if (statusResponse.status === 200) {
-        let statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            let data = statusData.data;
-            while ("next" in statusData) {
-              statusResponse = await this.getRequest(statusData.next, headers);
-              statusData = statusResponse.data;
-              data = data.concat(statusData.data);
-            }
-            statusData.data = data;
-            return statusData;
-          } else {
-            throw new FirecrawlError("Crawl job completed but no data was returned", 500);
-          }
-        } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
-          checkInterval = Math.max(checkInterval, 2);
-          await new Promise(
-            (resolve) => setTimeout(resolve, checkInterval * 1e3)
-          );
-        } else {
-          throw new FirecrawlError(
-            `Crawl job failed or was stopped. Status: ${statusData.status}`,
-            500
-          );
-        }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
-      }
-    }
+    try {
+      while (true) {
+        let statusResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === "object" && "next" in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
+            }
+          } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise(
+              (resolve) => setTimeout(resolve, checkInterval * 1e3)
+            );
+          } else {
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
+          }
+        } else {
+          this.handleError(statusResponse, "check crawl status");
+        }
+      }
+    } catch (error) {
+      throw new FirecrawlError(error, 500);
+    }
   }
   /**
@@ -557,10 +557,8 @@ var CrawlWatcher = class extends TypedEventTarget {
   ws;
   data;
   status;
-  id;
   constructor(id, app) {
     super();
-    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -570,8 +568,7 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data,
-            id: this.id
+            data: this.data
           }
         }));
       } else if (msg.type === "error") {
@@ -580,8 +577,7 @@ var CrawlWatcher = class extends TypedEventTarget {
           detail: {
             status: this.status,
            data: this.data,
-            error: msg.error,
-            id: this.id
+            error: msg.error
           }
         }));
       } else if (msg.type === "catchup") {
@@ -589,18 +585,12 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail: {
-              ...doc,
-              id: this.id
-            }
+            detail: doc
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: {
-            ...msg.data,
-            id: this.id
-          }
+          detail: msg.data
         }));
       }
     };
@@ -609,20 +599,12 @@ var CrawlWatcher = class extends TypedEventTarget {
      this.ws.close();
      return;
    }
-    try {
-      const msg = JSON.parse(ev.data);
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on message", error);
-    }
+    const msg = JSON.parse(ev.data);
+    messageHandler(msg);
   }).bind(this);
   this.ws.onclose = ((ev) => {
-    try {
-      const msg = JSON.parse(ev.reason);
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on close", error);
-    }
+    const msg = JSON.parse(ev.reason);
+    messageHandler(msg);
   }).bind(this);
   this.ws.onerror = ((_) => {
     this.status = "failed";
@@ -630,8 +612,7 @@ var CrawlWatcher = class extends TypedEventTarget {
       detail: {
         status: this.status,
         data: this.data,
-        error: "WebSocket error",
-        id: this.id
+        error: "WebSocket error"
       }
     }));
   }).bind(this);
package/package.json
CHANGED

-  "version": "1.10.0",
+  "version": "1.10.1",

package/src/__tests__/index.test.ts
CHANGED

@@ -1,9 +1,9 @@
-import { describe, expect, jest, test } from '@jest/globals';
-
-import FirecrawlApp from '../index';
+import { describe, test, expect, jest } from '@jest/globals';
 import axios from 'axios';
-import { join } from 'path';
+import FirecrawlApp from '../index';
+
 import { readFile } from 'fs/promises';
+import { join } from 'path';

 // Mock jest and set the type
 jest.mock('axios');
@@ -14,22 +14,13 @@ async function loadFixture(name: string): Promise<string> {
   return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
 }

-const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
-
 describe('the firecrawl JS SDK', () => {

-  test('Should require an API key only for the cloud service', async () => {
-    if (API_URL.includes('api.firecrawl.dev')) {
-      // Should throw for cloud service
-      expect(() => {
-        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
-      }).toThrow('No API key provided');
-    } else {
-      // Should not throw for self-hosted
-      expect(() => {
-        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
-      }).not.toThrow();
-    }
+  test('Should require an API key to instantiate FirecrawlApp', async () => {
+    const fn = () => {
+      new FirecrawlApp({ apiKey: undefined });
+    };
+    expect(fn).toThrow('No API key provided');
   });

   test('Should return scraped data from a /scrape API call', async () => {

package/src/__tests__/v1/e2e_withAuth/index.test.ts
CHANGED

@@ -9,28 +9,15 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
 const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";

 describe('FirecrawlApp E2E Tests', () => {
-  test.concurrent('should throw error for no API key only for the cloud service', async () => {
-    if (API_URL.includes('api.firecrawl.dev')) {
-      // Should throw for cloud service
-      expect(() => {
-        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
-      }).toThrow("No API key provided");
-    } else {
-      // Should not throw for self-hosted
-      expect(() => {
-        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
-      }).not.toThrow();
-    }
+  test.concurrent('should throw error for no API key', async () => {
+    expect(() => {
+      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+    }).toThrow("No API key provided");
   });

   test.concurrent('should throw error for invalid API key on scrape', async () => {
-    if (API_URL.includes('api.firecrawl.dev')) {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
-    } else {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
-    }
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });

   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@@ -168,13 +155,14 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should throw error for invalid API key on crawl', async () => {
-    if (API_URL.includes('api.firecrawl.dev')) {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
-    } else {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
-    }
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+  });
+
+  test.concurrent('should throw error for blocklisted URL on crawl', async () => {
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const blocklistedUrl = "https://twitter.com/fake-test";
+    await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
   });

   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -349,13 +337,8 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 60000); // 60 seconds timeout

   test.concurrent('should throw error for invalid API key on map', async () => {
-    if (API_URL.includes('api.firecrawl.dev')) {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
-    } else {
-      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
-    }
+    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
   });

   test.concurrent('should throw error for blocklisted URL on map', async () => {
@@ -372,7 +355,8 @@ describe('FirecrawlApp E2E Tests', () => {
   }, 30000); // 30 seconds timeout

   test.concurrent('should return successful response for valid map', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
+    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
     expect(response).not.toBeNull();

     expect(response.links?.length).toBeGreaterThan(0);
package/src/index.ts
CHANGED

@@ -183,7 +183,6 @@ export interface BatchScrapeResponse {
   url?: string;
   success: true;
   error?: string;
-  invalidURLs?: string[];
 }

 /**
@@ -243,11 +242,10 @@ export interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-  prompt?: string;
+  prompt: string;
   schema?: LLMSchema;
   systemPrompt?: string;
   allowExternalLinks?: boolean;
-  includeSubdomains?: boolean;
 }

 /**
@@ -290,23 +288,17 @@ export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;

-  private isCloudService(url: string): boolean {
-    return url.includes('api.firecrawl.dev');
-  }
-
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
    */
   constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    const baseUrl = apiUrl || "https://api.firecrawl.dev";
-
-    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
+    if (typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }

-    this.apiKey = apiKey || '';
-    this.apiUrl = baseUrl;
+    this.apiKey = apiKey;
+    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
   }

   /**
@@ -470,7 +462,7 @@ export default class FirecrawlApp {
     let statusData = response.data
     if ("data" in statusData) {
       let data = statusData.data;
-      while ('next' in statusData) {
+      while (typeof statusData === 'object' && 'next' in statusData) {
        statusData = (await this.getRequest(statusData.next, headers)).data;
        data = data.concat(statusData.data);
      }
@@ -584,10 +576,9 @@ export default class FirecrawlApp {
     pollInterval: number = 2,
     idempotencyKey?: string,
     webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...params, webhook, ignoreInvalidURLs };
+    let jsonData: any = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;

@@ -630,12 +621,10 @@ export default class FirecrawlApp {
   async asyncBatchScrapeUrls(
     urls: string[],
     params?: ScrapeParams,
-    idempotencyKey?: string,
-    webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
+    idempotencyKey?: string
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls, ...(params ?? {}), webhook, ignoreInvalidURLs };
+    let jsonData: any = { urls, ...(params ?? {}) };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -668,10 +657,8 @@ export default class FirecrawlApp {
     urls: string[],
     params?: ScrapeParams,
     idempotencyKey?: string,
-    webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
   ) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey, webhook, ignoreInvalidURLs);
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);

     if (crawl.success && crawl.id) {
       const id = crawl.id;
@@ -704,7 +691,7 @@ export default class FirecrawlApp {
     let statusData = response.data
     if ("data" in statusData) {
       let data = statusData.data;
-      while ('next' in statusData) {
+      while (typeof statusData === 'object' && 'next' in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -863,42 +850,46 @@ export default class FirecrawlApp {
     headers: AxiosRequestHeaders,
     checkInterval: number
   ): Promise<CrawlStatusResponse | ErrorResponse> {
-    while (true) {
-      let statusResponse: AxiosResponse = await this.getRequest(
-        `${this.apiUrl}/v1/crawl/${id}`,
-        headers
-      );
-      if (statusResponse.status === 200) {
-        let statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            let data = statusData.data;
-            while ('next' in statusData) {
-              statusResponse = await this.getRequest(statusData.next, headers);
-              statusData = statusResponse.data;
-              data = data.concat(statusData.data);
-            }
-            statusData.data = data;
-            return statusData;
-          } else {
-            throw new FirecrawlError("Crawl job completed but no data was returned", 500);
-          }
-        } else if (
-          ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
-        ) {
-          checkInterval = Math.max(checkInterval, 2);
-          await new Promise((resolve) =>
-            setTimeout(resolve, checkInterval * 1000)
-          );
-        } else {
-          throw new FirecrawlError(
-            `Crawl job failed or was stopped. Status: ${statusData.status}`,
-            500
-          );
-        }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
-      }
-    }
+    try {
+      while (true) {
+        let statusResponse: AxiosResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === 'object' && 'next' in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
+            }
+          } else if (
+            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
+          ) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise((resolve) =>
+              setTimeout(resolve, checkInterval * 1000)
+            );
+          } else {
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
+          }
+        } else {
+          this.handleError(statusResponse, "check crawl status");
+        }
+      }
+    } catch (error: any) {
+      throw new FirecrawlError(error, 500);
+    }
   }

@@ -941,11 +932,9 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
   private ws: WebSocket;
   public data: FirecrawlDocument<undefined>[];
   public status: CrawlStatusResponse["status"];
-  public id: string;

   constructor(id: string, app: FirecrawlApp) {
     super();
-    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -976,7 +965,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         detail: {
           status: this.status,
           data: this.data,
-          id: this.id,
         },
       }));
     } else if (msg.type === "error") {
@@ -986,7 +974,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           status: this.status,
           data: this.data,
           error: msg.error,
-          id: this.id,
         },
       }));
     } else if (msg.type === "catchup") {
@@ -994,18 +981,12 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
       this.data.push(...(msg.data.data ?? []));
       for (const doc of this.data) {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail: {
-            ...doc,
-            id: this.id,
-          },
+          detail: doc,
         }));
       }
     } else if (msg.type === "document") {
       this.dispatchTypedEvent("document", new CustomEvent("document", {
-        detail: {
-          ...msg.data,
-          id: this.id,
-        },
+        detail: msg.data,
       }));
     }
   }
@@ -1015,21 +996,14 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
       this.ws.close();
       return;
     }
-    try {
-      const msg = JSON.parse(ev.data) as Message;
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on message", error);
-    }
+
+    const msg = JSON.parse(ev.data) as Message;
+    messageHandler(msg);
   }).bind(this);

   this.ws.onclose = ((ev: CloseEvent) => {
-    try {
-      const msg = JSON.parse(ev.reason) as Message;
-      messageHandler(msg);
-    } catch (error) {
-      console.error("Error on close", error);
-    }
+    const msg = JSON.parse(ev.reason) as Message;
+    messageHandler(msg);
   }).bind(this);

   this.ws.onerror = ((_: Event) => {
@@ -1039,7 +1013,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
           status: this.status,
           data: this.data,
           error: "WebSocket error",
-          id: this.id,
         },
       }));
     }).bind(this);
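With the id field removed from CrawlWatcher and its event payloads, detail for "document" events is now the document itself, and the "done"/"error" payloads carry only status and data (plus error); if you need the job id, keep it from the crawl response yourself. A sketch, assuming crawlUrlAndWatch and the event names shown in this diff:

```ts
async function watchCrawl(app: FirecrawlApp): Promise<void> {
  const watcher = await app.crawlUrlAndWatch("https://example.com", { limit: 3 }); // placeholders
  watcher.addEventListener("document", (ev) => {
    // ev.detail is the FirecrawlDocument itself; no { ...doc, id } wrapper.
    console.log(ev.detail.metadata?.sourceURL);
  });
  watcher.addEventListener("done", (ev) => {
    // ev.detail: { status, data } — the id field is gone in 1.10.1.
    console.log(ev.detail.status, ev.detail.data.length);
  });
}
```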