firecrawl 1.9.7 → 1.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.cjs +45 -52
- package/dist/index.d.cts +4 -7
- package/dist/index.d.ts +4 -7
- package/dist/index.js +45 -52
- package/package.json +1 -1
- package/src/index.ts +45 -59
package/dist/index.cjs
CHANGED
```diff
@@ -198,7 +198,7 @@ var FirecrawlApp = class {
         let statusData = response.data;
         if ("data" in statusData) {
           let data = statusData.data;
-          while ("next" in statusData) {
+          while (typeof statusData === "object" && "next" in statusData) {
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
@@ -295,9 +295,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls,
+    let jsonData = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -333,9 +333,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls,
+    let jsonData = { urls, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -363,8 +363,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -393,7 +393,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -529,40 +529,44 @@ var FirecrawlApp = class {
    * @returns The final job status or data.
    */
   async monitorJobStatus(id, headers, checkInterval) {
-    if ("
+    try {
+      while (true) {
+        let statusResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === "object" && "next" in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
             }
+          } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise(
+              (resolve) => setTimeout(resolve, checkInterval * 1e3)
+            );
       } else {
-        throw new FirecrawlError(
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
       }
-    } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
-      checkInterval = Math.max(checkInterval, 2);
-      await new Promise(
-        (resolve) => setTimeout(resolve, checkInterval * 1e3)
-      );
       } else {
-        `Crawl job failed or was stopped. Status: ${statusData.status}`,
-        500
-      );
+        this.handleError(statusResponse, "check crawl status");
       }
-    } else {
-      this.handleError(statusResponse, "check crawl status");
     }
+    } catch (error) {
+      throw new FirecrawlError(error, 500);
     }
   }
   /**
@@ -589,10 +593,8 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
   ws;
   data;
   status;
-  id;
   constructor(id, app) {
     super();
-    this.id = id;
     this.ws = new import_isows.WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -602,8 +604,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
-            id: this.id
+            data: this.data
           }
         }));
       } else if (msg.type === "error") {
@@ -612,8 +613,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
-            id: this.id
+            error: msg.error
           }
         }));
       } else if (msg.type === "catchup") {
@@ -621,18 +621,12 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail:
-              ...doc,
-              id: this.id
-            }
+            detail: doc
           }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail:
-            ...msg.data,
-            id: this.id
-          }
+          detail: msg.data
         }));
       }
     };
@@ -654,8 +648,7 @@ var CrawlWatcher = class extends import_typescript_event_target.TypedEventTarget
         detail: {
           status: this.status,
           data: this.data,
-          error: "WebSocket error"
-          id: this.id
+          error: "WebSocket error"
         }
       }));
     }).bind(this);
```
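A note on the recurring pagination fix above: in JavaScript the `in` operator throws a `TypeError` when its right-hand operand is not an object, so the old `while ("next" in statusData)` could crash once a paginated status response resolved to `undefined` or another non-object. A minimal sketch of the guard, assuming a hypothetical `fetchPage` helper in place of the SDK's internal `getRequest`:

```ts
// Minimal sketch; fetchPage is a hypothetical stand-in for the SDK's getRequest.
type Page = { data: unknown[]; next?: string } | undefined;

async function collectAll(
  fetchPage: (url: string) => Promise<Page>,
  first: Page
): Promise<unknown[]> {
  let page = first;
  let data: unknown[] = page?.data ?? [];
  // Old guard: `while ("next" in page)` throws a TypeError once page is
  // undefined, because `in` requires an object on its right-hand side.
  // The typeof check short-circuits before `in` is ever evaluated.
  while (typeof page === "object" && page !== null && "next" in page && page.next) {
    page = await fetchPage(page.next);
    data = data.concat(page?.data ?? []);
  }
  return data;
}
```

The `typeof statusData === "object"` check is all the upstream change adds; the rest of the loop is unchanged.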
package/dist/index.d.cts
CHANGED
```diff
@@ -171,7 +171,6 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
-    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -226,11 +225,10 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt
+    prompt: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
-    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -331,8 +329,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -340,7 +338,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -416,7 +414,6 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
-    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
```
package/dist/index.d.ts
CHANGED
```diff
@@ -171,7 +171,6 @@ interface BatchScrapeResponse {
     url?: string;
     success: true;
     error?: string;
-    invalidURLs?: string[];
 }
 /**
  * Response interface for job status checks.
@@ -226,11 +225,10 @@ interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-    prompt
+    prompt: string;
     schema?: LLMSchema;
     systemPrompt?: string;
     allowExternalLinks?: boolean;
-    includeSubdomains?: boolean;
 }
 /**
  * Response interface for extracting information from URLs.
@@ -331,8 +329,8 @@ declare class FirecrawlApp {
      * @param webhook - Optional webhook for the batch scrape.
      * @returns The response from the crawl operation.
      */
-    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]
-    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string
+    batchScrapeUrls(urls: string[], params?: ScrapeParams, pollInterval?: number, idempotencyKey?: string, webhook?: CrawlParams["webhook"]): Promise<BatchScrapeStatusResponse | ErrorResponse>;
+    asyncBatchScrapeUrls(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<BatchScrapeResponse | ErrorResponse>;
     /**
      * Initiates a batch scrape job and returns a CrawlWatcher to monitor the job via WebSocket.
      * @param urls - The URL to scrape.
@@ -340,7 +338,7 @@ declare class FirecrawlApp {
      * @param idempotencyKey - Optional idempotency key for the request.
      * @returns A CrawlWatcher instance to monitor the crawl job.
      */
-    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string
+    batchScrapeUrlsAndWatch(urls: string[], params?: ScrapeParams, idempotencyKey?: string): Promise<CrawlWatcher>;
     /**
      * Checks the status of a batch scrape job using the Firecrawl API.
      * @param id - The ID of the batch scrape operation.
@@ -416,7 +414,6 @@ declare class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
     private ws;
     data: FirecrawlDocument<undefined>[];
     status: CrawlStatusResponse["status"];
-    id: string;
     constructor(id: string, app: FirecrawlApp);
     close(): void;
 }
```
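With the return types restored in both declaration files, the batch scrape methods once again resolve to a discriminated union, so `success` can narrow the result before `data` is touched. A hedged usage sketch against these typings; the import specifier, API key, URLs, and `formats` option are placeholders, not taken from this diff:

```ts
import FirecrawlApp from "firecrawl"; // assumed entry point for this package

async function main() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

  // Resolves to BatchScrapeStatusResponse | ErrorResponse per the declarations above.
  const result = await app.batchScrapeUrls(
    ["https://example.com", "https://example.org"],
    { formats: ["markdown"] }
  );

  if (result.success) {
    console.log(`scraped ${result.data.length} documents`);
  } else {
    console.error("batch scrape failed:", result.error);
  }
}

main();
```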
package/dist/index.js
CHANGED
```diff
@@ -162,7 +162,7 @@ var FirecrawlApp = class {
         let statusData = response.data;
         if ("data" in statusData) {
           let data = statusData.data;
-          while ("next" in statusData) {
+          while (typeof statusData === "object" && "next" in statusData) {
             statusData = (await this.getRequest(statusData.next, headers)).data;
             data = data.concat(statusData.data);
           }
@@ -259,9 +259,9 @@ var FirecrawlApp = class {
    * @param webhook - Optional webhook for the batch scrape.
    * @returns The response from the crawl operation.
    */
-  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook
+  async batchScrapeUrls(urls, params, pollInterval = 2, idempotencyKey, webhook) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls,
+    let jsonData = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
       try {
@@ -297,9 +297,9 @@ var FirecrawlApp = class {
     }
     return { success: false, error: "Internal server error." };
   }
-  async asyncBatchScrapeUrls(urls, params, idempotencyKey
+  async asyncBatchScrapeUrls(urls, params, idempotencyKey) {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData = { urls,
+    let jsonData = { urls, ...params ?? {} };
     try {
       const response = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -327,8 +327,8 @@ var FirecrawlApp = class {
    * @param idempotencyKey - Optional idempotency key for the request.
    * @returns A CrawlWatcher instance to monitor the crawl job.
    */
-  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey
+  async batchScrapeUrlsAndWatch(urls, params, idempotencyKey) {
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
     if (crawl.success && crawl.id) {
       const id = crawl.id;
       return new CrawlWatcher(id, this);
@@ -357,7 +357,7 @@ var FirecrawlApp = class {
     let statusData = response.data;
     if ("data" in statusData) {
       let data = statusData.data;
-      while ("next" in statusData) {
+      while (typeof statusData === "object" && "next" in statusData) {
         statusData = (await this.getRequest(statusData.next, headers)).data;
         data = data.concat(statusData.data);
       }
@@ -493,40 +493,44 @@ var FirecrawlApp = class {
    * @returns The final job status or data.
    */
   async monitorJobStatus(id, headers, checkInterval) {
-    if ("
+    try {
+      while (true) {
+        let statusResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === "object" && "next" in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
             }
+          } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise(
+              (resolve) => setTimeout(resolve, checkInterval * 1e3)
+            );
       } else {
-        throw new FirecrawlError(
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
       }
-    } else if (["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)) {
-      checkInterval = Math.max(checkInterval, 2);
-      await new Promise(
-        (resolve) => setTimeout(resolve, checkInterval * 1e3)
-      );
       } else {
-        `Crawl job failed or was stopped. Status: ${statusData.status}`,
-        500
-      );
+        this.handleError(statusResponse, "check crawl status");
       }
-    } else {
-      this.handleError(statusResponse, "check crawl status");
     }
+    } catch (error) {
+      throw new FirecrawlError(error, 500);
     }
   }
   /**
@@ -553,10 +557,8 @@ var CrawlWatcher = class extends TypedEventTarget {
   ws;
   data;
   status;
-  id;
   constructor(id, app) {
     super();
-    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -566,8 +568,7 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.dispatchTypedEvent("done", new CustomEvent("done", {
           detail: {
             status: this.status,
-            data: this.data
-            id: this.id
+            data: this.data
           }
         }));
       } else if (msg.type === "error") {
@@ -576,8 +577,7 @@ var CrawlWatcher = class extends TypedEventTarget {
           detail: {
             status: this.status,
             data: this.data,
-            error: msg.error
-            id: this.id
+            error: msg.error
           }
         }));
       } else if (msg.type === "catchup") {
@@ -585,18 +585,12 @@ var CrawlWatcher = class extends TypedEventTarget {
         this.data.push(...msg.data.data ?? []);
         for (const doc of this.data) {
           this.dispatchTypedEvent("document", new CustomEvent("document", {
-            detail:
-              ...doc,
-              id: this.id
-            }
+            detail: doc
          }));
         }
       } else if (msg.type === "document") {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail:
-            ...msg.data,
-            id: this.id
-          }
+          detail: msg.data
         }));
       }
     };
@@ -618,8 +612,7 @@ var CrawlWatcher = class extends TypedEventTarget {
         detail: {
           status: this.status,
           data: this.data,
-          error: "WebSocket error"
-          id: this.id
+          error: "WebSocket error"
         }
       }));
     }).bind(this);
```
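The rewritten `monitorJobStatus` in both compiled bundles wraps the entire polling loop in a single `try`/`catch`, sleeps at least two seconds between polls on transient states, and rethrows anything unexpected as a `FirecrawlError`. A standalone sketch of that control flow; `getStatus` and `JobStatus` are illustrative stand-ins, not part of the SDK surface:

```ts
// Generic illustration of the polling shape used by monitorJobStatus.
type JobStatus = { status: string; data?: unknown[]; next?: string };

async function pollUntilDone(
  getStatus: () => Promise<JobStatus>, // stand-in for the SDK's status request
  checkIntervalSeconds: number
): Promise<JobStatus> {
  while (true) {
    const status = await getStatus();
    if (status.status === "completed") {
      // The real code also drains `next` pagination links here.
      return status;
    } else if (
      ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(status.status)
    ) {
      // Clamp the interval to 2s, mirroring Math.max(checkInterval, 2).
      const seconds = Math.max(checkIntervalSeconds, 2);
      await new Promise((resolve) => setTimeout(resolve, seconds * 1000));
    } else {
      throw new Error(`Crawl job failed or was stopped. Status: ${status.status}`);
    }
  }
}
```

Clamping with `Math.max(checkInterval, 2)` matches the compiled code and keeps callers from hammering the status endpoint with sub-second intervals.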
package/package.json
CHANGED
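The viewer does not expand the single-line package.json change; given the versions being compared, it is presumably the version field:

```diff
-  "version": "1.9.7",
+  "version": "1.10.1",
```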
package/src/index.ts
CHANGED
```diff
@@ -183,7 +183,6 @@ export interface BatchScrapeResponse {
   url?: string;
   success: true;
   error?: string;
-  invalidURLs?: string[];
 }
 
 /**
@@ -243,11 +242,10 @@ export interface MapResponse {
  * Defines options for extracting information from URLs.
  */
 export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
-  prompt
+  prompt: string;
   schema?: LLMSchema;
   systemPrompt?: string;
   allowExternalLinks?: boolean;
-  includeSubdomains?: boolean;
 }
 
 /**
@@ -464,7 +462,7 @@ export default class FirecrawlApp {
       let statusData = response.data
       if ("data" in statusData) {
         let data = statusData.data;
-        while ('next' in statusData) {
+        while (typeof statusData === 'object' && 'next' in statusData) {
           statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
@@ -578,10 +576,9 @@ export default class FirecrawlApp {
     pollInterval: number = 2,
     idempotencyKey?: string,
     webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
   ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls,
+    let jsonData: any = { urls, ...params };
     if (jsonData?.extract?.schema) {
       let schema = jsonData.extract.schema;
 
@@ -624,12 +621,10 @@ export default class FirecrawlApp {
   async asyncBatchScrapeUrls(
     urls: string[],
     params?: ScrapeParams,
-    idempotencyKey?: string
-    webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
+    idempotencyKey?: string
   ): Promise<BatchScrapeResponse | ErrorResponse> {
     const headers = this.prepareHeaders(idempotencyKey);
-    let jsonData: any = { urls,
+    let jsonData: any = { urls, ...(params ?? {}) };
     try {
       const response: AxiosResponse = await this.postRequest(
         this.apiUrl + `/v1/batch/scrape`,
@@ -662,10 +657,8 @@ export default class FirecrawlApp {
     urls: string[],
     params?: ScrapeParams,
     idempotencyKey?: string,
-    webhook?: CrawlParams["webhook"],
-    ignoreInvalidURLs?: boolean,
   ) {
-    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey
+    const crawl = await this.asyncBatchScrapeUrls(urls, params, idempotencyKey);
 
     if (crawl.success && crawl.id) {
       const id = crawl.id;
@@ -698,7 +691,7 @@ export default class FirecrawlApp {
       let statusData = response.data
       if ("data" in statusData) {
         let data = statusData.data;
-        while ('next' in statusData) {
+        while (typeof statusData === 'object' && 'next' in statusData) {
           statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
@@ -857,42 +850,46 @@ export default class FirecrawlApp {
     headers: AxiosRequestHeaders,
     checkInterval: number
   ): Promise<CrawlStatusResponse | ErrorResponse> {
-    if ("
+    try {
+      while (true) {
+        let statusResponse: AxiosResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === 'object' && 'next' in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
             }
+          } else if (
+            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
+          ) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise((resolve) =>
+              setTimeout(resolve, checkInterval * 1000)
+            );
+          } else {
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
+          }
       } else {
-        `Crawl job failed or was stopped. Status: ${statusData.status}`,
-        500
-      );
+          this.handleError(statusResponse, "check crawl status");
       }
-    } else {
-      this.handleError(statusResponse, "check crawl status");
     }
+    } catch (error: any) {
+      throw new FirecrawlError(error, 500);
     }
   }
 
@@ -935,11 +932,9 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
   private ws: WebSocket;
   public data: FirecrawlDocument<undefined>[];
   public status: CrawlStatusResponse["status"];
-  public id: string;
 
   constructor(id: string, app: FirecrawlApp) {
     super();
-    this.id = id;
     this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey);
     this.status = "scraping";
     this.data = [];
@@ -970,7 +965,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         detail: {
           status: this.status,
           data: this.data,
-          id: this.id,
         },
       }));
     } else if (msg.type === "error") {
@@ -980,7 +974,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         status: this.status,
         data: this.data,
         error: msg.error,
-        id: this.id,
       },
     }));
   } else if (msg.type === "catchup") {
@@ -988,18 +981,12 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
       this.data.push(...(msg.data.data ?? []));
       for (const doc of this.data) {
         this.dispatchTypedEvent("document", new CustomEvent("document", {
-          detail:
-            ...doc,
-            id: this.id,
-          },
+          detail: doc,
         }));
       }
     } else if (msg.type === "document") {
       this.dispatchTypedEvent("document", new CustomEvent("document", {
-        detail:
-          ...msg.data,
-          id: this.id,
-        },
+        detail: msg.data,
       }));
     }
   }
@@ -1026,7 +1013,6 @@ export class CrawlWatcher extends TypedEventTarget<CrawlWatcherEvents> {
         status: this.status,
         data: this.data,
         error: "WebSocket error",
-        id: this.id,
       },
     }));
   }).bind(this);
```
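Since `CrawlWatcher` no longer stores an `id` or injects it into event payloads, `document` events now carry the document itself as `detail`, and the `done`/`error` payloads are just `{ status, data }` plus an optional `error`. A hedged usage sketch; the import specifier, API key, URL, and `formats` option are placeholders:

```ts
import FirecrawlApp from "firecrawl"; // assumed entry point for this package

async function watch() {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key
  const watcher = await app.batchScrapeUrlsAndWatch(
    ["https://example.com"], // placeholder URL
    { formats: ["markdown"] }
  );

  // detail is now the FirecrawlDocument itself; there is no injected id field.
  watcher.addEventListener("document", (event) => {
    console.log("document received:", event.detail);
  });

  watcher.addEventListener("done", (event) => {
    console.log("finished with status:", event.detail.status);
    watcher.close();
  });
}

watch();
```

Callers that previously read `event.detail.id` or `watcher.id` will need to track the job id themselves, for example from the `BatchScrapeResponse` returned by `asyncBatchScrapeUrls`.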