@mendable/firecrawl 3.2.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var require_package = __commonJS({
  "package.json"(exports, module) {
  module.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.2.1",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
package/dist/index.cjs CHANGED
@@ -35,7 +35,7 @@ var require_package = __commonJS({
  "package.json"(exports2, module2) {
  module2.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.2.1",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
@@ -395,6 +395,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -432,21 +463,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -549,19 +594,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -885,8 +944,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -940,8 +999,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
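Note: the compiled client methods above change default behavior — getCrawlStatus and getBatchScrapeStatus now follow the `next` cursor and merge every page into `data` (returning `next: null`) unless auto-pagination is turned off. A minimal sketch of the difference, assuming the client is constructed with an `apiKey` option and that the job id is a placeholder (neither appears in this diff):

import Firecrawl from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

async function showDifference(jobId: string) {
  // Default in 4.0.0: every `next` page is fetched and merged; `next` comes back null.
  const aggregated = await firecrawl.getCrawlStatus(jobId);
  console.log(aggregated.data.length, aggregated.next); // all documents, null

  // Opt out to get only the first page plus the raw cursor, as 3.x did.
  const firstPage = await firecrawl.getCrawlStatus(jobId, { autoPaginate: false });
  console.log(firstPage.data.length, firstPage.next); // first page only, cursor URL or null
}

showDifference("crawl-job-id").catch(console.error); // placeholder job id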
package/dist/index.d.cts CHANGED
@@ -4,7 +4,7 @@ import { AxiosResponse, AxiosRequestHeaders } from 'axios';
  import { EventEmitter } from 'events';
  import { TypedEventTarget } from 'typescript-event-target';

- type FormatString = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "summary" | "changeTracking" | "json";
+ type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes";
  interface Viewport {
  width: number;
  height: number;
@@ -33,7 +33,14 @@ interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
- type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat;
+ interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }
+ type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat;
  interface LocationConfig {
  country?: string;
  languages?: string[];
@@ -133,11 +140,27 @@ interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -427,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -470,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1348,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
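The PaginationConfig declaration above also exposes limits for the aggregation step. A hedged sketch combining them (client construction and the job id are placeholders, not taken from this diff):

import Firecrawl, { type PaginationConfig } from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

// Bound the aggregation: at most 5 extra pages, 500 documents in total,
// and no more than 30 seconds spent following `next` links.
const limits: PaginationConfig = { maxPages: 5, maxResults: 500, maxWaitTime: 30 };

async function boundedStatus(jobId: string) {
  const job = await firecrawl.getCrawlStatus(jobId, limits);
  console.log(job.status, job.completed, job.total, job.data.length);
}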
package/dist/index.d.ts CHANGED
@@ -4,7 +4,7 @@ import { AxiosResponse, AxiosRequestHeaders } from 'axios';
  import { EventEmitter } from 'events';
  import { TypedEventTarget } from 'typescript-event-target';

- type FormatString = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "summary" | "changeTracking" | "json";
+ type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes";
  interface Viewport {
  width: number;
  height: number;
@@ -33,7 +33,14 @@ interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
- type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat;
+ interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }
+ type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat;
  interface LocationConfig {
  country?: string;
  languages?: string[];
@@ -133,11 +140,27 @@ interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -427,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -470,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1348,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
  import {
  require_package
- } from "./chunk-QPAPMZLC.js";
+ } from "./chunk-YH34PXKT.js";

  // src/v2/utils/httpClient.ts
  import axios from "axios";
@@ -279,6 +279,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -316,21 +347,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -433,19 +478,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -769,8 +828,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -824,8 +883,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
@@ -933,7 +992,7 @@ var FirecrawlApp = class {
  if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
  return process.env.npm_package_version;
  }
- const packageJson = await import("./package-VNFDXLYR.js");
+ const packageJson = await import("./package-CW75NWUC.js");
  return packageJson.default.version;
  } catch (error) {
  const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
@@ -1,4 +1,4 @@
  import {
  require_package
- } from "./chunk-QPAPMZLC.js";
+ } from "./chunk-YH34PXKT.js";
  export default require_package();
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@mendable/firecrawl",
- "version": "3.2.1",
+ "version": "4.0.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@@ -121,6 +121,39 @@ describe("v2.scrape e2e", () => {
  }
  }, 90_000);

+ test("images format: extract all images from webpage", async () => {
+ if (!client) throw new Error();
+ const doc = await client.scrape("https://firecrawl.dev", {
+ formats: ["images"],
+ });
+ expect(doc.images).toBeTruthy();
+ expect(Array.isArray(doc.images)).toBe(true);
+ expect(doc.images.length).toBeGreaterThan(0);
+ // Should find firecrawl logo/branding images
+ expect(doc.images.some(img => img.includes("firecrawl") || img.includes("logo"))).toBe(true);
+ }, 60_000);
+
+ test("images format: works with multiple formats", async () => {
+ if (!client) throw new Error();
+ const doc = await client.scrape("https://github.com", {
+ formats: ["markdown", "links", "images"],
+ });
+ expect(doc.markdown).toBeTruthy();
+ expect(doc.links).toBeTruthy();
+ expect(doc.images).toBeTruthy();
+ expect(Array.isArray(doc.images)).toBe(true);
+ expect(doc.images.length).toBeGreaterThan(0);
+
+ // Images should find things not available in links format
+ const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico'];
+ const linkImages = doc.links?.filter(link =>
+ imageExtensions.some(ext => link.toLowerCase().includes(ext))
+ ) || [];
+
+ // Should discover additional images beyond those with obvious extensions
+ expect(doc.images.length).toBeGreaterThanOrEqual(linkImages.length);
+ }, 60_000);
+
  test("invalid url should throw", async () => {
  if (!client) throw new Error();
  await expect(client.scrape("")).rejects.toThrow("URL cannot be empty");
@@ -0,0 +1,112 @@
+ import { describe, test, expect, jest } from "@jest/globals";
+ import { getCrawlStatus } from "../../../v2/methods/crawl";
+ import { getBatchScrapeStatus } from "../../../v2/methods/batch";
+
+ describe("JS SDK v2 pagination", () => {
+ function makeHttp(getImpl: (url: string) => any) {
+ return { get: jest.fn(async (u: string) => getImpl(u)) } as any;
+ }
+
+ test("crawl: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/next", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/next");
+ });
+
+ test("crawl: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return second;
+ return third;
+ });
+ const res = await getCrawlStatus(http, "job1");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("crawl: respects maxPages and maxResults", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 10, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const page = (n: number) => ({ status: 200, data: { success: true, next: n < 3 ? `https://api/n${n + 1}` : null, data: [{ markdown: `p${n}` }] } });
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return page(1);
+ if (url.endsWith("n2")) return page(2);
+ return page(3);
+ });
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: true, maxPages: 2, maxResults: 2 });
+ expect(res.data.length).toBe(2);
+ });
+
+ test("batch: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return second;
+ return third;
+ });
+ const res = await getBatchScrapeStatus(http, "jobB");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("batch: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/nextBatch", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/nextBatch");
+ });
+
+ test("crawl: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getCrawlStatus(http, "jobC", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2); // initial + first page
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2); // initial + n1 only
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+
+ test("batch: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2);
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2);
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+ });
+
+
package/src/v2/client.ts CHANGED
@@ -36,6 +36,7 @@ import type {
  ExtractResponse,
  CrawlOptions,
  BatchScrapeOptions,
+ PaginationConfig,
  } from "./types";
  import { Watcher } from "./watcher";
  import type { WatcherOptions } from "./watcher";
@@ -145,8 +146,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId: string): Promise<CrawlJob> {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob> {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -201,8 +202,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob> {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob> {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
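The client wrappers above forward the optional pagination argument unchanged, so a cheap progress check can skip aggregation while the final fetch keeps the default. A sketch under the same placeholder assumptions as before (constructor option and job id are not from this diff; only the "completed" status is handled, other terminal states are omitted):

import Firecrawl from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

async function waitForBatch(jobId: string) {
  // Poll without pulling every page: only the first page of documents comes back.
  let status = await firecrawl.getBatchScrapeStatus(jobId, { autoPaginate: false });
  while (status.status !== "completed") {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    status = await firecrawl.getBatchScrapeStatus(jobId, { autoPaginate: false });
  }
  // Once finished, fetch again with the default auto-pagination to collect everything.
  return firecrawl.getBatchScrapeStatus(jobId);
}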
@@ -4,9 +4,11 @@ import {
  type CrawlErrorsResponse,
  type Document,
  type BatchScrapeOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
+ import { fetchAllPages } from "../utils/pagination";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";

  export async function startBatchScrape(
@@ -47,19 +49,38 @@ export async function startBatchScrape(
  }
  }

- export async function getBatchScrapeStatus(http: HttpClient, jobId: string): Promise<BatchScrapeJob> {
+ export async function getBatchScrapeStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<BatchScrapeJob> {
  try {
  const res = await http.get<{ success: boolean; status: BatchScrapeJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -115,5 +136,4 @@ export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
  const chunks: string[][] = [];
  for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
  return chunks;
- }
-
+ }
@@ -5,10 +5,13 @@ import {
  type CrawlResponse,
  type Document,
  type CrawlOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+ import type { HttpClient as _Http } from "../utils/httpClient";
+ import { fetchAllPages } from "../utils/pagination";

  export type CrawlRequest = CrawlOptions & {
  url: string;
@@ -52,21 +55,42 @@ export async function startCrawl(http: HttpClient, request: CrawlRequest): Promi
  }
  }

- export async function getCrawlStatus(http: HttpClient, jobId: string): Promise<CrawlJob> {
+ export async function getCrawlStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<CrawlJob> {
  try {
  const res = await http.get<{ success: boolean; status: CrawlJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
+
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -140,5 +164,4 @@ export async function crawlParamsPreview(http: HttpClient, url: string, prompt:
  if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
  throw err;
  }
- }
-
+ }
package/src/v2/types.ts CHANGED
@@ -6,10 +6,12 @@ export type FormatString =
  | "html"
  | "rawHtml"
  | "links"
+ | "images"
  | "screenshot"
  | "summary"
  | "changeTracking"
- | "json";
+ | "json"
+ | "attributes";

  export interface Viewport {
  width: number;
@@ -40,13 +42,21 @@ export interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
+ export interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }

  export type FormatOption =
  | FormatString
  | Format
  | JsonFormat
  | ChangeTrackingFormat
- | ScreenshotFormat;
+ | ScreenshotFormat
+ | AttributesFormat;

  export interface LocationConfig {
  country?: string;
@@ -167,12 +177,30 @@ export interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }

+ // Pagination configuration for auto-fetching pages from v2 endpoints that return a `next` URL
+ export interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
+
  export interface SearchResultWeb {
  url: string;
  title?: string;
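The new "images" string format and the attributes object format defined above can be mixed with the existing formats on scrape. A hedged sketch based on these types (client construction, the target URL, and the selector are placeholders, not taken from this diff):

import Firecrawl, { type AttributesFormat } from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

// Collect a data attribute from every matching element (selector is illustrative only).
const priceAttributes: AttributesFormat = {
  type: "attributes",
  selectors: [{ selector: "[data-price]", attribute: "data-price" }],
};

async function scrapeWithNewFormats(url: string) {
  const doc = await firecrawl.scrape(url, {
    formats: ["markdown", "images", priceAttributes],
  });
  console.log(doc.images?.length);          // image URLs discovered on the page
  console.log(doc.attributes?.[0]?.values); // attribute values gathered per selector
}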
@@ -0,0 +1,45 @@
+ import type { HttpClient } from "../utils/httpClient";
+ import type { Document, PaginationConfig } from "../types";
+
+ /**
+ * Shared helper to follow `next` cursors and aggregate documents with limits.
+ */
+ export async function fetchAllPages(
+ http: HttpClient,
+ nextUrl: string,
+ initial: Document[],
+ pagination?: PaginationConfig
+ ): Promise<Document[]> {
+ const docs = initial.slice();
+ let current: string | null = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? undefined;
+ const maxResults = pagination?.maxResults ?? undefined;
+ const maxWaitTime = pagination?.maxWaitTime ?? undefined;
+ const started = Date.now();
+
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1000 > maxWaitTime) break;
+
+ let payload: { success: boolean; next?: string | null; data?: Document[] } | null = null;
+ try {
+ const res = await http.get<{ success: boolean; next?: string | null; data?: Document[] }>(current);
+ payload = res.data;
+ } catch {
+ break; // axios rejects on non-2xx; stop pagination gracefully
+ }
+ if (!payload?.success) break;
+
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d as Document);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = (payload.next ?? null) as string | null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
+
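A note on the new helper: maxResults counts the documents already gathered from the first response, maxWaitTime is checked before each additional request, and a failed request ends pagination quietly with whatever has been collected so far. fetchAllPages is internal (it is not re-exported from the package entry point), so the sketch below exercises it only through a stubbed object exposing the single `get` method it relies on; the stub shape, import path, and URLs are assumptions for illustration:

import { fetchAllPages } from "./pagination"; // path relative to src/v2/utils, for illustration only

// Stub standing in for HttpClient: serves one extra page, then ends the cursor chain.
const stubHttp: any = {
  get: async (url: string) => ({
    data: url.endsWith("/page2")
      ? { success: true, next: null, data: [{ markdown: "second page" }] }
      : { success: true, next: "https://api.example.com/page2", data: [{ markdown: "first page" }] },
  }),
};

async function demo() {
  const docs = await fetchAllPages(stubHttp, "https://api.example.com/page1", [], { maxResults: 10 });
  console.log(docs.length); // 2 — both stubbed pages were followed and merged
}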