firecrawl 3.3.0 → 4.0.0

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
@@ -8,7 +8,7 @@ var require_package = __commonJS({
  "package.json"(exports, module) {
  module.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.3.0",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
package/dist/index.cjs CHANGED
@@ -35,7 +35,7 @@ var require_package = __commonJS({
  "package.json"(exports2, module2) {
  module2.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.3.0",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
@@ -395,6 +395,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -432,21 +463,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -549,19 +594,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -885,8 +944,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -940,8 +999,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
package/dist/index.d.cts CHANGED
@@ -151,6 +151,16 @@ interface Document {
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -440,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -483,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1361,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
package/dist/index.d.ts CHANGED
@@ -151,6 +151,16 @@ interface Document {
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -440,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -483,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1361,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
  import {
  require_package
- } from "./chunk-Y3QF4XAJ.js";
+ } from "./chunk-YH34PXKT.js";

  // src/v2/utils/httpClient.ts
  import axios from "axios";
@@ -279,6 +279,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -316,21 +347,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -433,19 +478,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -769,8 +828,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -824,8 +883,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
@@ -933,7 +992,7 @@ var FirecrawlApp = class {
  if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
  return process.env.npm_package_version;
  }
- const packageJson = await import("./package-LI2S3JCZ.js");
+ const packageJson = await import("./package-CW75NWUC.js");
  return packageJson.default.version;
  } catch (error) {
  const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
@@ -1,4 +1,4 @@
  import {
  require_package
- } from "./chunk-Y3QF4XAJ.js";
+ } from "./chunk-YH34PXKT.js";
  export default require_package();
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "firecrawl",
- "version": "3.3.0",
+ "version": "4.0.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@@ -0,0 +1,112 @@
+ import { describe, test, expect, jest } from "@jest/globals";
+ import { getCrawlStatus } from "../../../v2/methods/crawl";
+ import { getBatchScrapeStatus } from "../../../v2/methods/batch";
+
+ describe("JS SDK v2 pagination", () => {
+ function makeHttp(getImpl: (url: string) => any) {
+ return { get: jest.fn(async (u: string) => getImpl(u)) } as any;
+ }
+
+ test("crawl: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/next", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/next");
+ });
+
+ test("crawl: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return second;
+ return third;
+ });
+ const res = await getCrawlStatus(http, "job1");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("crawl: respects maxPages and maxResults", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 10, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const page = (n: number) => ({ status: 200, data: { success: true, next: n < 3 ? `https://api/n${n + 1}` : null, data: [{ markdown: `p${n}` }] } });
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return page(1);
+ if (url.endsWith("n2")) return page(2);
+ return page(3);
+ });
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: true, maxPages: 2, maxResults: 2 });
+ expect(res.data.length).toBe(2);
+ });
+
+ test("batch: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return second;
+ return third;
+ });
+ const res = await getBatchScrapeStatus(http, "jobB");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("batch: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/nextBatch", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/nextBatch");
+ });
+
+ test("crawl: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getCrawlStatus(http, "jobC", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2); // initial + first page
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2); // initial + n1 only
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+
+ test("batch: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2);
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2);
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+ });
+
+
package/src/v2/client.ts CHANGED
@@ -36,6 +36,7 @@ import type {
  ExtractResponse,
  CrawlOptions,
  BatchScrapeOptions,
+ PaginationConfig,
  } from "./types";
  import { Watcher } from "./watcher";
  import type { WatcherOptions } from "./watcher";
@@ -145,8 +146,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId: string): Promise<CrawlJob> {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob> {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -201,8 +202,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob> {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob> {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
@@ -4,9 +4,11 @@ import {
  type CrawlErrorsResponse,
  type Document,
  type BatchScrapeOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
+ import { fetchAllPages } from "../utils/pagination";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";

  export async function startBatchScrape(
@@ -47,19 +49,38 @@ export async function startBatchScrape(
  }
  }

- export async function getBatchScrapeStatus(http: HttpClient, jobId: string): Promise<BatchScrapeJob> {
+ export async function getBatchScrapeStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<BatchScrapeJob> {
  try {
  const res = await http.get<{ success: boolean; status: BatchScrapeJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -115,5 +136,4 @@ export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
  const chunks: string[][] = [];
  for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
  return chunks;
- }
-
+ }
@@ -5,10 +5,13 @@ import {
  type CrawlResponse,
  type Document,
  type CrawlOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+ import type { HttpClient as _Http } from "../utils/httpClient";
+ import { fetchAllPages } from "../utils/pagination";

  export type CrawlRequest = CrawlOptions & {
  url: string;
@@ -52,21 +55,42 @@ export async function startCrawl(http: HttpClient, request: CrawlRequest): Promi
  }
  }

- export async function getCrawlStatus(http: HttpClient, jobId: string): Promise<CrawlJob> {
+ export async function getCrawlStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<CrawlJob> {
  try {
  const res = await http.get<{ success: boolean; status: CrawlJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
+
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -140,5 +164,4 @@ export async function crawlParamsPreview(http: HttpClient, url: string, prompt:
  if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
  throw err;
  }
- }
-
+ }
package/src/v2/types.ts CHANGED
@@ -189,6 +189,18 @@ export interface Document {
  changeTracking?: Record<string, unknown>;
  }

+ // Pagination configuration for auto-fetching pages from v2 endpoints that return a `next` URL
+ export interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
+
  export interface SearchResultWeb {
  url: string;
  title?: string;
@@ -0,0 +1,45 @@
+ import type { HttpClient } from "../utils/httpClient";
+ import type { Document, PaginationConfig } from "../types";
+
+ /**
+ * Shared helper to follow `next` cursors and aggregate documents with limits.
+ */
+ export async function fetchAllPages(
+ http: HttpClient,
+ nextUrl: string,
+ initial: Document[],
+ pagination?: PaginationConfig
+ ): Promise<Document[]> {
+ const docs = initial.slice();
+ let current: string | null = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? undefined;
+ const maxResults = pagination?.maxResults ?? undefined;
+ const maxWaitTime = pagination?.maxWaitTime ?? undefined;
+ const started = Date.now();
+
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1000 > maxWaitTime) break;
+
+ let payload: { success: boolean; next?: string | null; data?: Document[] } | null = null;
+ try {
+ const res = await http.get<{ success: boolean; next?: string | null; data?: Document[] }>(current);
+ payload = res.data;
+ } catch {
+ break; // axios rejects on non-2xx; stop pagination gracefully
+ }
+ if (!payload?.success) break;
+
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d as Document);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = (payload.next ?? null) as string | null;
+ pageCount += 1;
+ }
+ return docs;
+ }
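
For reference, a minimal usage sketch of the pagination controls introduced in 4.0.0. The getCrawlStatus and getBatchScrapeStatus signatures and the PaginationConfig fields come from the diff above; the client constructor options, API key, and job ids below are placeholders and assumptions, not part of this release.

import Firecrawl from "firecrawl";

async function main() {
  // Placeholder API key and job ids; replace with real values.
  const firecrawl = new Firecrawl({ apiKey: "fc-YOUR-API-KEY" });

  // Default behaviour: follow `next` links and aggregate every document (next is null afterwards).
  const allDocs = await firecrawl.getCrawlStatus("your-crawl-job-id");

  // Opt out of auto-pagination and handle `next` yourself.
  const firstPage = await firecrawl.getCrawlStatus("your-crawl-job-id", { autoPaginate: false });

  // Or cap how much extra fetching is done.
  const capped = await firecrawl.getBatchScrapeStatus("your-batch-job-id", {
    maxPages: 2,      // at most 2 additional pages after the first response
    maxResults: 100,  // stop once 100 documents have been collected
    maxWaitTime: 15,  // stop after roughly 15 seconds of extra fetching
  });

  console.log(allDocs.data.length, firstPage.next, capped.data.length);
}

main().catch(console.error);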