firecrawl 3.3.0 → 4.0.0
This diff shows the contents of publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- package/dist/{chunk-Y3QF4XAJ.js → chunk-YH34PXKT.js} +1 -1
- package/dist/index.cjs +70 -11
- package/dist/index.d.cts +13 -3
- package/dist/index.d.ts +13 -3
- package/dist/index.js +71 -12
- package/dist/{package-LI2S3JCZ.js → package-CW75NWUC.js} +1 -1
- package/package.json +1 -1
- package/src/__tests__/unit/v2/pagination.test.ts +112 -0
- package/src/v2/client.ts +5 -4
- package/src/v2/methods/batch.ts +25 -5
- package/src/v2/methods/crawl.ts +28 -5
- package/src/v2/types.ts +12 -0
- package/src/v2/utils/pagination.ts +45 -0
package/dist/{chunk-Y3QF4XAJ.js → chunk-YH34PXKT.js}
CHANGED
@@ -8,7 +8,7 @@ var require_package = __commonJS({
   "package.json"(exports, module) {
     module.exports = {
       name: "@mendable/firecrawl-js",
-      version: "3.3.0",
+      version: "4.0.0",
       description: "JavaScript SDK for Firecrawl API",
       main: "dist/index.js",
       types: "dist/index.d.ts",
package/dist/index.cjs
CHANGED
@@ -35,7 +35,7 @@ var require_package = __commonJS({
   "package.json"(exports2, module2) {
     module2.exports = {
       name: "@mendable/firecrawl-js",
-      version: "3.3.0",
+      version: "4.0.0",
       description: "JavaScript SDK for Firecrawl API",
       main: "dist/index.js",
       types: "dist/index.d.ts",
@@ -395,6 +395,37 @@ async function map(http, url, options) {
   }
 }
 
+// src/v2/utils/pagination.ts
+async function fetchAllPages(http, nextUrl, initial, pagination) {
+  const docs = initial.slice();
+  let current = nextUrl;
+  let pageCount = 0;
+  const maxPages = pagination?.maxPages ?? void 0;
+  const maxResults = pagination?.maxResults ?? void 0;
+  const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+  const started = Date.now();
+  while (current) {
+    if (maxPages != null && pageCount >= maxPages) break;
+    if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+    let payload = null;
+    try {
+      const res = await http.get(current);
+      payload = res.data;
+    } catch {
+      break;
+    }
+    if (!payload?.success) break;
+    for (const d of payload.data || []) {
+      if (maxResults != null && docs.length >= maxResults) break;
+      docs.push(d);
+    }
+    if (maxResults != null && docs.length >= maxResults) break;
+    current = payload.next ?? null;
+    pageCount += 1;
+  }
+  return docs;
+}
+
 // src/v2/methods/crawl.ts
 function prepareCrawlPayload(request) {
   if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -432,21 +463,35 @@ async function startCrawl(http, request) {
     throw err;
   }
 }
-async function getCrawlStatus(http, jobId) {
+async function getCrawlStatus(http, jobId, pagination) {
   try {
     const res = await http.get(`/v2/crawl/${jobId}`);
     if (res.status !== 200 || !res.data?.success) {
       throwForBadResponse(res, "get crawl status");
     }
     const body = res.data;
+    const initialDocs = body.data || [];
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs
+      };
+    }
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated
     };
   } catch (err) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -549,19 +594,33 @@ async function startBatchScrape(http, urls, {
     throw err;
   }
 }
-async function getBatchScrapeStatus(http, jobId) {
+async function getBatchScrapeStatus(http, jobId, pagination) {
   try {
     const res = await http.get(`/v2/batch/scrape/${jobId}`);
    if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
     const body = res.data;
+    const initialDocs = body.data || [];
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs
+      };
+    }
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated
     };
   } catch (err) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -885,8 +944,8 @@ var FirecrawlClient = class {
    * Get the status and partial data of a crawl job.
    * @param jobId Crawl job id.
    */
-  async getCrawlStatus(jobId) {
-    return getCrawlStatus(this.http, jobId);
+  async getCrawlStatus(jobId, pagination) {
+    return getCrawlStatus(this.http, jobId, pagination);
   }
   /**
    * Cancel a crawl job.
@@ -940,8 +999,8 @@ var FirecrawlClient = class {
    * Get the status and partial data of a batch scrape job.
    * @param jobId Batch job id.
    */
-  async getBatchScrapeStatus(jobId) {
-    return getBatchScrapeStatus(this.http, jobId);
+  async getBatchScrapeStatus(jobId, pagination) {
+    return getBatchScrapeStatus(this.http, jobId, pagination);
   }
   /**
    * Retrieve batch scrape errors and robots.txt blocks.
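For SDK consumers, the practical effect of the compiled changes above is that getCrawlStatus and getBatchScrapeStatus now follow every `next` link by default and return the aggregated documents with next set to null, while autoPaginate: false keeps the previous single-page behaviour. A minimal usage sketch (the job id, api key handling, and variable names are illustrative, not part of this diff):

  import Firecrawl from "@mendable/firecrawl-js";

  const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY });
  const jobId = "an-existing-crawl-job-id"; // placeholder

  // Default: follow every `next` link and return the full document set.
  const fullJob = await firecrawl.getCrawlStatus(jobId);
  console.log(fullJob.data.length, fullJob.next); // next is null after aggregation

  // Opt out: fetch only the first page and keep `next` for manual paging.
  const firstPage = await firecrawl.getCrawlStatus(jobId, { autoPaginate: false });
  console.log(firstPage.data.length, firstPage.next); // next may be a URL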
package/dist/index.d.cts
CHANGED
@@ -151,6 +151,16 @@ interface Document {
     warning?: string;
     changeTracking?: Record<string, unknown>;
 }
+interface PaginationConfig {
+    /** When true (default), automatically follow `next` links and aggregate all documents. */
+    autoPaginate?: boolean;
+    /** Maximum number of additional pages to fetch after the first response. */
+    maxPages?: number;
+    /** Maximum total number of documents to return across all pages. */
+    maxResults?: number;
+    /** Maximum time to spend fetching additional pages (in seconds). */
+    maxWaitTime?: number;
+}
 interface SearchResultWeb {
     url: string;
     title?: string;
@@ -440,7 +450,7 @@ declare class FirecrawlClient {
     * Get the status and partial data of a crawl job.
     * @param jobId Crawl job id.
     */
-    getCrawlStatus(jobId: string): Promise<CrawlJob>;
+    getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
    /**
     * Cancel a crawl job.
     * @param jobId Crawl job id.
@@ -483,7 +493,7 @@ declare class FirecrawlClient {
     * Get the status and partial data of a batch scrape job.
     * @param jobId Batch job id.
     */
-    getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+    getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
    /**
     * Retrieve batch scrape errors and robots.txt blocks.
     * @param jobId Batch job id.
@@ -1361,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
     get v1(): FirecrawlApp;
 }
 
-export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
package/dist/index.d.ts
CHANGED
@@ -151,6 +151,16 @@ interface Document {
     warning?: string;
     changeTracking?: Record<string, unknown>;
 }
+interface PaginationConfig {
+    /** When true (default), automatically follow `next` links and aggregate all documents. */
+    autoPaginate?: boolean;
+    /** Maximum number of additional pages to fetch after the first response. */
+    maxPages?: number;
+    /** Maximum total number of documents to return across all pages. */
+    maxResults?: number;
+    /** Maximum time to spend fetching additional pages (in seconds). */
+    maxWaitTime?: number;
+}
 interface SearchResultWeb {
     url: string;
     title?: string;
@@ -440,7 +450,7 @@ declare class FirecrawlClient {
     * Get the status and partial data of a crawl job.
     * @param jobId Crawl job id.
     */
-    getCrawlStatus(jobId: string): Promise<CrawlJob>;
+    getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
    /**
     * Cancel a crawl job.
     * @param jobId Crawl job id.
@@ -483,7 +493,7 @@ declare class FirecrawlClient {
     * Get the status and partial data of a batch scrape job.
     * @param jobId Batch job id.
     */
-    getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+    getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
    /**
     * Retrieve batch scrape errors and robots.txt blocks.
     * @param jobId Batch job id.
@@ -1361,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
     get v1(): FirecrawlApp;
 }
 
-export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
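The PaginationConfig knobs declared above bound the aggregation rather than change its shape: maxPages limits how many additional pages are fetched after the first response, maxResults caps the total number of documents kept (initial page included), and maxWaitTime is a wall-clock budget in seconds checked before each extra page request. A hedged sketch against these declarations (the client instance, batch job id, and limit values are placeholders):

  // `firecrawl` is an already-constructed Firecrawl client instance.
  const batchJobId = "an-existing-batch-scrape-job-id"; // placeholder

  const job = await firecrawl.getBatchScrapeStatus(batchJobId, {
    maxPages: 5,      // at most 5 follow-up pages after the first response
    maxResults: 200,  // stop once 200 documents have been collected in total
    maxWaitTime: 30,  // stop following `next` links after roughly 30 seconds
  });
  // job.data holds the (possibly truncated) aggregate; job.next is null after auto-pagination.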
package/dist/index.js
CHANGED
@@ -1,6 +1,6 @@
 import {
   require_package
-} from "./chunk-Y3QF4XAJ.js";
+} from "./chunk-YH34PXKT.js";
 
 // src/v2/utils/httpClient.ts
 import axios from "axios";
@@ -279,6 +279,37 @@ async function map(http, url, options) {
   }
 }
 
+// src/v2/utils/pagination.ts
+async function fetchAllPages(http, nextUrl, initial, pagination) {
+  const docs = initial.slice();
+  let current = nextUrl;
+  let pageCount = 0;
+  const maxPages = pagination?.maxPages ?? void 0;
+  const maxResults = pagination?.maxResults ?? void 0;
+  const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+  const started = Date.now();
+  while (current) {
+    if (maxPages != null && pageCount >= maxPages) break;
+    if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+    let payload = null;
+    try {
+      const res = await http.get(current);
+      payload = res.data;
+    } catch {
+      break;
+    }
+    if (!payload?.success) break;
+    for (const d of payload.data || []) {
+      if (maxResults != null && docs.length >= maxResults) break;
+      docs.push(d);
+    }
+    if (maxResults != null && docs.length >= maxResults) break;
+    current = payload.next ?? null;
+    pageCount += 1;
+  }
+  return docs;
+}
+
 // src/v2/methods/crawl.ts
 function prepareCrawlPayload(request) {
   if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -316,21 +347,35 @@ async function startCrawl(http, request) {
     throw err;
   }
 }
-async function getCrawlStatus(http, jobId) {
+async function getCrawlStatus(http, jobId, pagination) {
   try {
     const res = await http.get(`/v2/crawl/${jobId}`);
     if (res.status !== 200 || !res.data?.success) {
       throwForBadResponse(res, "get crawl status");
     }
     const body = res.data;
+    const initialDocs = body.data || [];
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs
+      };
+    }
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated
     };
   } catch (err) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -433,19 +478,33 @@ async function startBatchScrape(http, urls, {
     throw err;
   }
 }
-async function getBatchScrapeStatus(http, jobId) {
+async function getBatchScrapeStatus(http, jobId, pagination) {
   try {
     const res = await http.get(`/v2/batch/scrape/${jobId}`);
     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
     const body = res.data;
+    const initialDocs = body.data || [];
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs
+      };
+    }
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated
     };
   } catch (err) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -769,8 +828,8 @@ var FirecrawlClient = class {
    * Get the status and partial data of a crawl job.
    * @param jobId Crawl job id.
    */
-  async getCrawlStatus(jobId) {
-    return getCrawlStatus(this.http, jobId);
+  async getCrawlStatus(jobId, pagination) {
+    return getCrawlStatus(this.http, jobId, pagination);
   }
   /**
    * Cancel a crawl job.
@@ -824,8 +883,8 @@ var FirecrawlClient = class {
    * Get the status and partial data of a batch scrape job.
    * @param jobId Batch job id.
    */
-  async getBatchScrapeStatus(jobId) {
-    return getBatchScrapeStatus(this.http, jobId);
+  async getBatchScrapeStatus(jobId, pagination) {
+    return getBatchScrapeStatus(this.http, jobId, pagination);
   }
   /**
    * Retrieve batch scrape errors and robots.txt blocks.
@@ -933,7 +992,7 @@ var FirecrawlApp = class {
     if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
       return process.env.npm_package_version;
     }
-    const packageJson = await import("./package-LI2S3JCZ.js");
+    const packageJson = await import("./package-CW75NWUC.js");
     return packageJson.default.version;
   } catch (error) {
     const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
package/package.json
CHANGED
package/src/__tests__/unit/v2/pagination.test.ts
ADDED
@@ -0,0 +1,112 @@
+import { describe, test, expect, jest } from "@jest/globals";
+import { getCrawlStatus } from "../../../v2/methods/crawl";
+import { getBatchScrapeStatus } from "../../../v2/methods/batch";
+
+describe("JS SDK v2 pagination", () => {
+  function makeHttp(getImpl: (url: string) => any) {
+    return { get: jest.fn(async (u: string) => getImpl(u)) } as any;
+  }
+
+  test("crawl: autoPaginate=false returns next", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/next", data: [{ markdown: "a" }] } };
+    const http = makeHttp(() => first);
+    const res = await getCrawlStatus(http, "job1", { autoPaginate: false });
+    expect(res.data.length).toBe(1);
+    expect(res.next).toBe("https://api/next");
+  });
+
+  test("crawl: default autoPaginate aggregates and nulls next", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/n1", data: [{ markdown: "a" }] } };
+    const second = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+    const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+    const http = makeHttp((url) => {
+      if (url.includes("/v2/crawl/")) return first;
+      if (url.endsWith("n1")) return second;
+      return third;
+    });
+    const res = await getCrawlStatus(http, "job1");
+    expect(res.data.length).toBe(3);
+    expect(res.next).toBeNull();
+  });
+
+  test("crawl: respects maxPages and maxResults", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 10, next: "https://api/n1", data: [{ markdown: "a" }] } };
+    const page = (n: number) => ({ status: 200, data: { success: true, next: n < 3 ? `https://api/n${n + 1}` : null, data: [{ markdown: `p${n}` }] } });
+    const http = makeHttp((url) => {
+      if (url.includes("/v2/crawl/")) return first;
+      if (url.endsWith("n1")) return page(1);
+      if (url.endsWith("n2")) return page(2);
+      return page(3);
+    });
+    const res = await getCrawlStatus(http, "job1", { autoPaginate: true, maxPages: 2, maxResults: 2 });
+    expect(res.data.length).toBe(2);
+  });
+
+  test("batch: default autoPaginate aggregates and nulls next", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/b1", data: [{ markdown: "a" }] } };
+    const second = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+    const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+    const http = makeHttp((url) => {
+      if (url.includes("/v2/batch/scrape/")) return first;
+      if (url.endsWith("b1")) return second;
+      return third;
+    });
+    const res = await getBatchScrapeStatus(http, "jobB");
+    expect(res.data.length).toBe(3);
+    expect(res.next).toBeNull();
+  });
+
+  test("batch: autoPaginate=false returns next", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/nextBatch", data: [{ markdown: "a" }] } };
+    const http = makeHttp(() => first);
+    const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: false });
+    expect(res.data.length).toBe(1);
+    expect(res.next).toBe("https://api/nextBatch");
+  });
+
+  test("crawl: maxWaitTime stops pagination after first page", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/n1", data: [{ markdown: "a" }] } };
+    const p1 = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+    const http: any = makeHttp((url: string) => {
+      if (url.includes("/v2/crawl/")) return first;
+      if (url.endsWith("n1")) return p1;
+      return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+    });
+    const nowSpy = jest.spyOn(Date, "now");
+    try {
+      nowSpy
+        .mockImplementationOnce(() => 0) // started
+        .mockImplementationOnce(() => 0) // first loop check
+        .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+      const res = await getCrawlStatus(http, "jobC", { autoPaginate: true, maxWaitTime: 1 });
+      expect(res.data.length).toBe(2); // initial + first page
+      expect((http.get as jest.Mock).mock.calls.length).toBe(2); // initial + n1 only
+    } finally {
+      nowSpy.mockRestore();
+    }
+  });
+
+  test("batch: maxWaitTime stops pagination after first page", async () => {
+    const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/b1", data: [{ markdown: "a" }] } };
+    const p1 = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+    const http: any = makeHttp((url: string) => {
+      if (url.includes("/v2/batch/scrape/")) return first;
+      if (url.endsWith("b1")) return p1;
+      return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+    });
+    const nowSpy = jest.spyOn(Date, "now");
+    try {
+      nowSpy
+        .mockImplementationOnce(() => 0) // started
+        .mockImplementationOnce(() => 0) // first loop check
+        .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+      const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: true, maxWaitTime: 1 });
+      expect(res.data.length).toBe(2);
+      expect((http.get as jest.Mock).mock.calls.length).toBe(2);
+    } finally {
+      nowSpy.mockRestore();
+    }
+  });
+});
+
+
package/src/v2/client.ts
CHANGED
@@ -36,6 +36,7 @@ import type {
   ExtractResponse,
   CrawlOptions,
   BatchScrapeOptions,
+  PaginationConfig,
 } from "./types";
 import { Watcher } from "./watcher";
 import type { WatcherOptions } from "./watcher";
@@ -145,8 +146,8 @@ export class FirecrawlClient {
    * Get the status and partial data of a crawl job.
    * @param jobId Crawl job id.
    */
-  async getCrawlStatus(jobId: string): Promise<CrawlJob> {
-    return getCrawlStatus(this.http, jobId);
+  async getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob> {
+    return getCrawlStatus(this.http, jobId, pagination);
   }
   /**
    * Cancel a crawl job.
@@ -201,8 +202,8 @@ export class FirecrawlClient {
    * Get the status and partial data of a batch scrape job.
    * @param jobId Batch job id.
    */
-  async getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob> {
-    return getBatchScrapeStatus(this.http, jobId);
+  async getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob> {
+    return getBatchScrapeStatus(this.http, jobId, pagination);
   }
   /**
    * Retrieve batch scrape errors and robots.txt blocks.
package/src/v2/methods/batch.ts
CHANGED
@@ -4,9 +4,11 @@ import {
   type CrawlErrorsResponse,
   type Document,
   type BatchScrapeOptions,
+  type PaginationConfig,
 } from "../types";
 import { HttpClient } from "../utils/httpClient";
 import { ensureValidScrapeOptions } from "../utils/validation";
+import { fetchAllPages } from "../utils/pagination";
 import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
 
 export async function startBatchScrape(
@@ -47,19 +49,38 @@ export async function startBatchScrape(
   }
 }
 
-export async function getBatchScrapeStatus(
+export async function getBatchScrapeStatus(
+  http: HttpClient,
+  jobId: string,
+  pagination?: PaginationConfig
+): Promise<BatchScrapeJob> {
   try {
     const res = await http.get<{ success: boolean; status: BatchScrapeJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/batch/scrape/${jobId}`);
     if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
     const body = res.data;
+    const initialDocs = (body.data || []) as Document[];
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs,
+      };
+    }
+
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated,
     };
   } catch (err: any) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -115,5 +136,4 @@ export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
   const chunks: string[][] = [];
   for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
   return chunks;
-}
-
+}
package/src/v2/methods/crawl.ts
CHANGED
@@ -5,10 +5,13 @@ import {
   type CrawlResponse,
   type Document,
   type CrawlOptions,
+  type PaginationConfig,
 } from "../types";
 import { HttpClient } from "../utils/httpClient";
 import { ensureValidScrapeOptions } from "../utils/validation";
 import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+import type { HttpClient as _Http } from "../utils/httpClient";
+import { fetchAllPages } from "../utils/pagination";
 
 export type CrawlRequest = CrawlOptions & {
   url: string;
@@ -52,21 +55,42 @@ export async function startCrawl(http: HttpClient, request: CrawlRequest): Promi
   }
 }
 
-export async function getCrawlStatus(
+export async function getCrawlStatus(
+  http: HttpClient,
+  jobId: string,
+  pagination?: PaginationConfig
+): Promise<CrawlJob> {
   try {
     const res = await http.get<{ success: boolean; status: CrawlJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/crawl/${jobId}`);
     if (res.status !== 200 || !res.data?.success) {
       throwForBadResponse(res, "get crawl status");
     }
     const body = res.data;
+    const initialDocs = (body.data || []) as Document[];
+
+    const auto = pagination?.autoPaginate ?? true;
+    if (!auto || !body.next) {
+      return {
+        status: body.status,
+        completed: body.completed ?? 0,
+        total: body.total ?? 0,
+        creditsUsed: body.creditsUsed,
+        expiresAt: body.expiresAt,
+        next: body.next ?? null,
+        data: initialDocs,
+      };
+    }
+
+    const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
+
     return {
       status: body.status,
       completed: body.completed ?? 0,
       total: body.total ?? 0,
       creditsUsed: body.creditsUsed,
       expiresAt: body.expiresAt,
-      next:
-      data:
+      next: null,
+      data: aggregated,
     };
   } catch (err: any) {
     if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -140,5 +164,4 @@ export async function crawlParamsPreview(http: HttpClient, url: string, prompt:
     if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
     throw err;
   }
-}
-
+}
package/src/v2/types.ts
CHANGED
@@ -189,6 +189,18 @@ export interface Document {
   changeTracking?: Record<string, unknown>;
 }
 
+// Pagination configuration for auto-fetching pages from v2 endpoints that return a `next` URL
+export interface PaginationConfig {
+  /** When true (default), automatically follow `next` links and aggregate all documents. */
+  autoPaginate?: boolean;
+  /** Maximum number of additional pages to fetch after the first response. */
+  maxPages?: number;
+  /** Maximum total number of documents to return across all pages. */
+  maxResults?: number;
+  /** Maximum time to spend fetching additional pages (in seconds). */
+  maxWaitTime?: number;
+}
+
 export interface SearchResultWeb {
   url: string;
   title?: string;
package/src/v2/utils/pagination.ts
ADDED
@@ -0,0 +1,45 @@
+import type { HttpClient } from "../utils/httpClient";
+import type { Document, PaginationConfig } from "../types";
+
+/**
+ * Shared helper to follow `next` cursors and aggregate documents with limits.
+ */
+export async function fetchAllPages(
+  http: HttpClient,
+  nextUrl: string,
+  initial: Document[],
+  pagination?: PaginationConfig
+): Promise<Document[]> {
+  const docs = initial.slice();
+  let current: string | null = nextUrl;
+  let pageCount = 0;
+  const maxPages = pagination?.maxPages ?? undefined;
+  const maxResults = pagination?.maxResults ?? undefined;
+  const maxWaitTime = pagination?.maxWaitTime ?? undefined;
+  const started = Date.now();
+
+  while (current) {
+    if (maxPages != null && pageCount >= maxPages) break;
+    if (maxWaitTime != null && (Date.now() - started) / 1000 > maxWaitTime) break;
+
+    let payload: { success: boolean; next?: string | null; data?: Document[] } | null = null;
+    try {
+      const res = await http.get<{ success: boolean; next?: string | null; data?: Document[] }>(current);
+      payload = res.data;
+    } catch {
+      break; // axios rejects on non-2xx; stop pagination gracefully
+    }
+    if (!payload?.success) break;
+
+    for (const d of payload.data || []) {
+      if (maxResults != null && docs.length >= maxResults) break;
+      docs.push(d as Document);
+    }
+    if (maxResults != null && docs.length >= maxResults) break;
+    current = (payload.next ?? null) as string | null;
+    pageCount += 1;
+  }
+  return docs;
+}
+
+
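One behavioural detail worth noting in fetchAllPages above: a failed page request or a non-success payload silently ends pagination and the documents aggregated so far are returned, so no error surfaces to the caller. A defensive check on the consumer side might look like the following sketch (not part of the SDK; the comparison against total is an illustrative heuristic):

  const job = await firecrawl.getCrawlStatus(jobId);
  if (job.status === "completed" && job.data.length < job.total) {
    // Some pages may have been dropped by a failed `next` fetch or a configured limit;
    // re-request the status, or pass { autoPaginate: false } and page manually via `next`.
  }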