@mendable/firecrawl 3.2.1 → 4.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -8,7 +8,7 @@ var require_package = __commonJS({
  "package.json"(exports, module) {
  module.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.2.1",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
package/dist/index.cjs CHANGED
@@ -35,7 +35,7 @@ var require_package = __commonJS({
  "package.json"(exports2, module2) {
  module2.exports = {
  name: "@mendable/firecrawl-js",
- version: "3.2.1",
+ version: "4.0.0",
  description: "JavaScript SDK for Firecrawl API",
  main: "dist/index.js",
  types: "dist/index.d.ts",
@@ -395,6 +395,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -432,21 +463,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -549,19 +594,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -885,8 +944,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -940,8 +999,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
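Note: the compiled client methods above change default behavior — getCrawlStatus and getBatchScrapeStatus now follow the `next` cursor and merge every page into `data` (returning `next: null`) unless auto-pagination is turned off. A minimal sketch of the difference, assuming the client is constructed with an `apiKey` option and that the job id is a placeholder (neither appears in this diff):

import Firecrawl from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

async function showDifference(jobId: string) {
  // Default in 4.0.0: every `next` page is fetched and merged; `next` comes back null.
  const aggregated = await firecrawl.getCrawlStatus(jobId);
  console.log(aggregated.data.length, aggregated.next); // all documents, null

  // Opt out to get only the first page plus the raw cursor, as 3.x did.
  const firstPage = await firecrawl.getCrawlStatus(jobId, { autoPaginate: false });
  console.log(firstPage.data.length, firstPage.next); // first page only, cursor URL or null
}

showDifference("crawl-job-id").catch(console.error); // placeholder job id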
package/dist/index.d.cts CHANGED
@@ -4,7 +4,7 @@ import { AxiosResponse, AxiosRequestHeaders } from 'axios';
  import { EventEmitter } from 'events';
  import { TypedEventTarget } from 'typescript-event-target';

- type FormatString = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "summary" | "changeTracking" | "json";
+ type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes";
  interface Viewport {
  width: number;
  height: number;
@@ -33,7 +33,14 @@ interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
- type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat;
+ interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }
+ type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat;
  interface LocationConfig {
  country?: string;
  languages?: string[];
@@ -133,11 +140,27 @@ interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -427,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -470,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1348,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
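The PaginationConfig declaration above also exposes limits for the aggregation step. A hedged sketch combining them (client construction and the job id are placeholders, not taken from this diff):

import Firecrawl, { type PaginationConfig } from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

// Bound the aggregation: at most 5 extra pages, 500 documents in total,
// and no more than 30 seconds spent following `next` links.
const limits: PaginationConfig = { maxPages: 5, maxResults: 500, maxWaitTime: 30 };

async function boundedStatus(jobId: string) {
  const job = await firecrawl.getCrawlStatus(jobId, limits);
  console.log(job.status, job.completed, job.total, job.data.length);
}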
package/dist/index.d.ts CHANGED
@@ -4,7 +4,7 @@ import { AxiosResponse, AxiosRequestHeaders } from 'axios';
  import { EventEmitter } from 'events';
  import { TypedEventTarget } from 'typescript-event-target';

- type FormatString = "markdown" | "html" | "rawHtml" | "links" | "screenshot" | "summary" | "changeTracking" | "json";
+ type FormatString = "markdown" | "html" | "rawHtml" | "links" | "images" | "screenshot" | "summary" | "changeTracking" | "json" | "attributes";
  interface Viewport {
  width: number;
  height: number;
@@ -33,7 +33,14 @@ interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
- type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat;
+ interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }
+ type FormatOption = FormatString | Format | JsonFormat | ChangeTrackingFormat | ScreenshotFormat | AttributesFormat;
  interface LocationConfig {
  country?: string;
  languages?: string[];
@@ -133,11 +140,27 @@ interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }
+ interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
  interface SearchResultWeb {
  url: string;
  title?: string;
@@ -427,7 +450,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- getCrawlStatus(jobId: string): Promise<CrawlJob>;
+ getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob>;
  /**
  * Cancel a crawl job.
  * @param jobId Crawl job id.
@@ -470,7 +493,7 @@ declare class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob>;
+ getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob>;
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
  * @param jobId Batch job id.
@@ -1348,4 +1371,4 @@ declare class Firecrawl extends FirecrawlClient {
  get v1(): FirecrawlApp;
  }

- export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
+ export { type ActionOption, type ActiveCrawl, type ActiveCrawlsResponse, type AttributesFormat, type BatchScrapeJob, type BatchScrapeOptions, type BatchScrapeResponse$1 as BatchScrapeResponse, type CategoryOption, type ChangeTrackingFormat, type ClickAction, type ConcurrencyCheck, type CrawlErrorsResponse$1 as CrawlErrorsResponse, type CrawlJob, type CrawlOptions, type CrawlResponse$1 as CrawlResponse, type CreditUsage, type Document, type DocumentMetadata, type ErrorDetails, type ExecuteJavascriptAction, type ExtractResponse$1 as ExtractResponse, Firecrawl, FirecrawlApp as FirecrawlAppV1, FirecrawlClient, type FirecrawlClientOptions, type Format, type FormatOption, type FormatString, type JsonFormat, type LocationConfig, type MapData, type MapOptions, type PDFAction, type PaginationConfig, type PressAction, type ScrapeAction, type ScrapeOptions, type ScreenshotAction, type ScreenshotFormat, type ScrollAction, SdkError, type SearchData, type SearchRequest, type SearchResultImages, type SearchResultNews, type SearchResultWeb, type TokenUsage, type Viewport, type WaitAction, type WebhookConfig, type WriteAction, Firecrawl as default };
package/dist/index.js CHANGED
@@ -1,6 +1,6 @@
  import {
  require_package
- } from "./chunk-QPAPMZLC.js";
+ } from "./chunk-YH34PXKT.js";

  // src/v2/utils/httpClient.ts
  import axios from "axios";
@@ -279,6 +279,37 @@ async function map(http, url, options) {
  }
  }

+ // src/v2/utils/pagination.ts
+ async function fetchAllPages(http, nextUrl, initial, pagination) {
+ const docs = initial.slice();
+ let current = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? void 0;
+ const maxResults = pagination?.maxResults ?? void 0;
+ const maxWaitTime = pagination?.maxWaitTime ?? void 0;
+ const started = Date.now();
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1e3 > maxWaitTime) break;
+ let payload = null;
+ try {
+ const res = await http.get(current);
+ payload = res.data;
+ } catch {
+ break;
+ }
+ if (!payload?.success) break;
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = payload.next ?? null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
  // src/v2/methods/crawl.ts
  function prepareCrawlPayload(request) {
  if (!request.url || !request.url.trim()) throw new Error("URL cannot be empty");
@@ -316,21 +347,35 @@ async function startCrawl(http, request) {
  throw err;
  }
  }
- async function getCrawlStatus(http, jobId) {
+ async function getCrawlStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -433,19 +478,33 @@ async function startBatchScrape(http, urls, {
  throw err;
  }
  }
- async function getBatchScrapeStatus(http, jobId) {
+ async function getBatchScrapeStatus(http, jobId, pagination) {
  try {
  const res = await http.get(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = body.data || [];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs
+ };
+ }
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: body.data || []
+ next: null,
+ data: aggregated
  };
  } catch (err) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -769,8 +828,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId) {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId, pagination) {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -824,8 +883,8 @@ var FirecrawlClient = class {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId) {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId, pagination) {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
@@ -933,7 +992,7 @@ var FirecrawlApp = class {
  if (typeof process !== "undefined" && process.env && process.env.npm_package_version) {
  return process.env.npm_package_version;
  }
- const packageJson = await import("./package-VNFDXLYR.js");
+ const packageJson = await import("./package-CW75NWUC.js");
  return packageJson.default.version;
  } catch (error) {
  const isTest = typeof process !== "undefined" && (process.env.JEST_WORKER_ID != null || false);
@@ -1,4 +1,4 @@
  import {
  require_package
- } from "./chunk-QPAPMZLC.js";
+ } from "./chunk-YH34PXKT.js";
  export default require_package();
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "@mendable/firecrawl",
- "version": "3.2.1",
+ "version": "4.0.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@@ -121,6 +121,39 @@ describe("v2.scrape e2e", () => {
  }
  }, 90_000);

+ test("images format: extract all images from webpage", async () => {
+ if (!client) throw new Error();
+ const doc = await client.scrape("https://firecrawl.dev", {
+ formats: ["images"],
+ });
+ expect(doc.images).toBeTruthy();
+ expect(Array.isArray(doc.images)).toBe(true);
+ expect(doc.images.length).toBeGreaterThan(0);
+ // Should find firecrawl logo/branding images
+ expect(doc.images.some(img => img.includes("firecrawl") || img.includes("logo"))).toBe(true);
+ }, 60_000);
+
+ test("images format: works with multiple formats", async () => {
+ if (!client) throw new Error();
+ const doc = await client.scrape("https://github.com", {
+ formats: ["markdown", "links", "images"],
+ });
+ expect(doc.markdown).toBeTruthy();
+ expect(doc.links).toBeTruthy();
+ expect(doc.images).toBeTruthy();
+ expect(Array.isArray(doc.images)).toBe(true);
+ expect(doc.images.length).toBeGreaterThan(0);
+
+ // Images should find things not available in links format
+ const imageExtensions = ['.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.ico'];
+ const linkImages = doc.links?.filter(link =>
+ imageExtensions.some(ext => link.toLowerCase().includes(ext))
+ ) || [];
+
+ // Should discover additional images beyond those with obvious extensions
+ expect(doc.images.length).toBeGreaterThanOrEqual(linkImages.length);
+ }, 60_000);
+
  test("invalid url should throw", async () => {
  if (!client) throw new Error();
  await expect(client.scrape("")).rejects.toThrow("URL cannot be empty");
@@ -0,0 +1,112 @@
+ import { describe, test, expect, jest } from "@jest/globals";
+ import { getCrawlStatus } from "../../../v2/methods/crawl";
+ import { getBatchScrapeStatus } from "../../../v2/methods/batch";
+
+ describe("JS SDK v2 pagination", () => {
+ function makeHttp(getImpl: (url: string) => any) {
+ return { get: jest.fn(async (u: string) => getImpl(u)) } as any;
+ }
+
+ test("crawl: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/next", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/next");
+ });
+
+ test("crawl: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return second;
+ return third;
+ });
+ const res = await getCrawlStatus(http, "job1");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("crawl: respects maxPages and maxResults", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 10, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const page = (n: number) => ({ status: 200, data: { success: true, next: n < 3 ? `https://api/n${n + 1}` : null, data: [{ markdown: `p${n}` }] } });
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return page(1);
+ if (url.endsWith("n2")) return page(2);
+ return page(3);
+ });
+ const res = await getCrawlStatus(http, "job1", { autoPaginate: true, maxPages: 2, maxResults: 2 });
+ expect(res.data.length).toBe(2);
+ });
+
+ test("batch: default autoPaginate aggregates and nulls next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 3, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const second = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const third = { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ const http = makeHttp((url) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return second;
+ return third;
+ });
+ const res = await getBatchScrapeStatus(http, "jobB");
+ expect(res.data.length).toBe(3);
+ expect(res.next).toBeNull();
+ });
+
+ test("batch: autoPaginate=false returns next", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 2, next: "https://api/nextBatch", data: [{ markdown: "a" }] } };
+ const http = makeHttp(() => first);
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: false });
+ expect(res.data.length).toBe(1);
+ expect(res.next).toBe("https://api/nextBatch");
+ });
+
+ test("crawl: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/n1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/n2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/crawl/")) return first;
+ if (url.endsWith("n1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getCrawlStatus(http, "jobC", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2); // initial + first page
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2); // initial + n1 only
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+
+ test("batch: maxWaitTime stops pagination after first page", async () => {
+ const first = { status: 200, data: { success: true, status: "completed", completed: 1, total: 5, next: "https://api/b1", data: [{ markdown: "a" }] } };
+ const p1 = { status: 200, data: { success: true, next: "https://api/b2", data: [{ markdown: "b" }] } };
+ const http: any = makeHttp((url: string) => {
+ if (url.includes("/v2/batch/scrape/")) return first;
+ if (url.endsWith("b1")) return p1;
+ return { status: 200, data: { success: true, next: null, data: [{ markdown: "c" }] } };
+ });
+ const nowSpy = jest.spyOn(Date, "now");
+ try {
+ nowSpy
+ .mockImplementationOnce(() => 0) // started
+ .mockImplementationOnce(() => 0) // first loop check
+ .mockImplementationOnce(() => 3000); // second loop check > maxWaitTime
+ const res = await getBatchScrapeStatus(http, "jobB", { autoPaginate: true, maxWaitTime: 1 });
+ expect(res.data.length).toBe(2);
+ expect((http.get as jest.Mock).mock.calls.length).toBe(2);
+ } finally {
+ nowSpy.mockRestore();
+ }
+ });
+ });
+
+
package/src/v2/client.ts CHANGED
@@ -36,6 +36,7 @@ import type {
  ExtractResponse,
  CrawlOptions,
  BatchScrapeOptions,
+ PaginationConfig,
  } from "./types";
  import { Watcher } from "./watcher";
  import type { WatcherOptions } from "./watcher";
@@ -145,8 +146,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a crawl job.
  * @param jobId Crawl job id.
  */
- async getCrawlStatus(jobId: string): Promise<CrawlJob> {
- return getCrawlStatus(this.http, jobId);
+ async getCrawlStatus(jobId: string, pagination?: PaginationConfig): Promise<CrawlJob> {
+ return getCrawlStatus(this.http, jobId, pagination);
  }
  /**
  * Cancel a crawl job.
@@ -201,8 +202,8 @@ export class FirecrawlClient {
  * Get the status and partial data of a batch scrape job.
  * @param jobId Batch job id.
  */
- async getBatchScrapeStatus(jobId: string): Promise<BatchScrapeJob> {
- return getBatchScrapeStatus(this.http, jobId);
+ async getBatchScrapeStatus(jobId: string, pagination?: PaginationConfig): Promise<BatchScrapeJob> {
+ return getBatchScrapeStatus(this.http, jobId, pagination);
  }
  /**
  * Retrieve batch scrape errors and robots.txt blocks.
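The client wrappers above forward the optional pagination argument unchanged, so a cheap progress check can skip aggregation while the final fetch keeps the default. A sketch under the same placeholder assumptions as before (constructor option and job id are not from this diff; only the "completed" status is handled, other terminal states are omitted):

import Firecrawl from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

async function waitForBatch(jobId: string) {
  // Poll without pulling every page: only the first page of documents comes back.
  let status = await firecrawl.getBatchScrapeStatus(jobId, { autoPaginate: false });
  while (status.status !== "completed") {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    status = await firecrawl.getBatchScrapeStatus(jobId, { autoPaginate: false });
  }
  // Once finished, fetch again with the default auto-pagination to collect everything.
  return firecrawl.getBatchScrapeStatus(jobId);
}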
@@ -4,9 +4,11 @@ import {
  type CrawlErrorsResponse,
  type Document,
  type BatchScrapeOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
+ import { fetchAllPages } from "../utils/pagination";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";

  export async function startBatchScrape(
@@ -47,19 +49,38 @@ export async function startBatchScrape(
  }
  }

- export async function getBatchScrapeStatus(http: HttpClient, jobId: string): Promise<BatchScrapeJob> {
+ export async function getBatchScrapeStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<BatchScrapeJob> {
  try {
  const res = await http.get<{ success: boolean; status: BatchScrapeJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/batch/scrape/${jobId}`);
  if (res.status !== 200 || !res.data?.success) throwForBadResponse(res, "get batch scrape status");
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get batch scrape status");
@@ -115,5 +136,4 @@ export function chunkUrls(urls: string[], chunkSize = 100): string[][] {
  const chunks: string[][] = [];
  for (let i = 0; i < urls.length; i += chunkSize) chunks.push(urls.slice(i, i + chunkSize));
  return chunks;
- }
-
+ }
@@ -5,10 +5,13 @@ import {
  type CrawlResponse,
  type Document,
  type CrawlOptions,
+ type PaginationConfig,
  } from "../types";
  import { HttpClient } from "../utils/httpClient";
  import { ensureValidScrapeOptions } from "../utils/validation";
  import { normalizeAxiosError, throwForBadResponse } from "../utils/errorHandler";
+ import type { HttpClient as _Http } from "../utils/httpClient";
+ import { fetchAllPages } from "../utils/pagination";

  export type CrawlRequest = CrawlOptions & {
  url: string;
@@ -52,21 +55,42 @@ export async function startCrawl(http: HttpClient, request: CrawlRequest): Promi
  }
  }

- export async function getCrawlStatus(http: HttpClient, jobId: string): Promise<CrawlJob> {
+ export async function getCrawlStatus(
+ http: HttpClient,
+ jobId: string,
+ pagination?: PaginationConfig
+ ): Promise<CrawlJob> {
  try {
  const res = await http.get<{ success: boolean; status: CrawlJob["status"]; completed?: number; total?: number; creditsUsed?: number; expiresAt?: string; next?: string | null; data?: Document[] }>(`/v2/crawl/${jobId}`);
  if (res.status !== 200 || !res.data?.success) {
  throwForBadResponse(res, "get crawl status");
  }
  const body = res.data;
+ const initialDocs = (body.data || []) as Document[];
+
+ const auto = pagination?.autoPaginate ?? true;
+ if (!auto || !body.next) {
+ return {
+ status: body.status,
+ completed: body.completed ?? 0,
+ total: body.total ?? 0,
+ creditsUsed: body.creditsUsed,
+ expiresAt: body.expiresAt,
+ next: body.next ?? null,
+ data: initialDocs,
+ };
+ }
+
+ const aggregated = await fetchAllPages(http, body.next, initialDocs, pagination);
+
  return {
  status: body.status,
  completed: body.completed ?? 0,
  total: body.total ?? 0,
  creditsUsed: body.creditsUsed,
  expiresAt: body.expiresAt,
- next: body.next ?? null,
- data: (body.data || []) as Document[],
+ next: null,
+ data: aggregated,
  };
  } catch (err: any) {
  if (err?.isAxiosError) return normalizeAxiosError(err, "get crawl status");
@@ -140,5 +164,4 @@ export async function crawlParamsPreview(http: HttpClient, url: string, prompt:
  if (err?.isAxiosError) return normalizeAxiosError(err, "crawl params preview");
  throw err;
  }
- }
-
+ }
package/src/v2/types.ts CHANGED
@@ -6,10 +6,12 @@ export type FormatString =
  | "html"
  | "rawHtml"
  | "links"
+ | "images"
  | "screenshot"
  | "summary"
  | "changeTracking"
- | "json";
+ | "json"
+ | "attributes";

  export interface Viewport {
  width: number;
@@ -40,13 +42,21 @@ export interface ChangeTrackingFormat extends Format {
  prompt?: string;
  tag?: string;
  }
+ export interface AttributesFormat extends Format {
+ type: "attributes";
+ selectors: Array<{
+ selector: string;
+ attribute: string;
+ }>;
+ }

  export type FormatOption =
  | FormatString
  | Format
  | JsonFormat
  | ChangeTrackingFormat
- | ScreenshotFormat;
+ | ScreenshotFormat
+ | AttributesFormat;

  export interface LocationConfig {
  country?: string;
@@ -167,12 +177,30 @@ export interface Document {
  summary?: string;
  metadata?: DocumentMetadata;
  links?: string[];
+ images?: string[];
  screenshot?: string;
+ attributes?: Array<{
+ selector: string;
+ attribute: string;
+ values: string[];
+ }>;
  actions?: Record<string, unknown>;
  warning?: string;
  changeTracking?: Record<string, unknown>;
  }

+ // Pagination configuration for auto-fetching pages from v2 endpoints that return a `next` URL
+ export interface PaginationConfig {
+ /** When true (default), automatically follow `next` links and aggregate all documents. */
+ autoPaginate?: boolean;
+ /** Maximum number of additional pages to fetch after the first response. */
+ maxPages?: number;
+ /** Maximum total number of documents to return across all pages. */
+ maxResults?: number;
+ /** Maximum time to spend fetching additional pages (in seconds). */
+ maxWaitTime?: number;
+ }
+
  export interface SearchResultWeb {
  url: string;
  title?: string;
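The new "images" string format and the attributes object format defined above can be mixed with the existing formats on scrape. A hedged sketch based on these types (client construction, the target URL, and the selector are placeholders, not taken from this diff):

import Firecrawl, { type AttributesFormat } from "@mendable/firecrawl";

const firecrawl = new Firecrawl({ apiKey: process.env.FIRECRAWL_API_KEY }); // assumed constructor option

// Collect a data attribute from every matching element (selector is illustrative only).
const priceAttributes: AttributesFormat = {
  type: "attributes",
  selectors: [{ selector: "[data-price]", attribute: "data-price" }],
};

async function scrapeWithNewFormats(url: string) {
  const doc = await firecrawl.scrape(url, {
    formats: ["markdown", "images", priceAttributes],
  });
  console.log(doc.images?.length);          // image URLs discovered on the page
  console.log(doc.attributes?.[0]?.values); // attribute values gathered per selector
}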
@@ -0,0 +1,45 @@
+ import type { HttpClient } from "../utils/httpClient";
+ import type { Document, PaginationConfig } from "../types";
+
+ /**
+ * Shared helper to follow `next` cursors and aggregate documents with limits.
+ */
+ export async function fetchAllPages(
+ http: HttpClient,
+ nextUrl: string,
+ initial: Document[],
+ pagination?: PaginationConfig
+ ): Promise<Document[]> {
+ const docs = initial.slice();
+ let current: string | null = nextUrl;
+ let pageCount = 0;
+ const maxPages = pagination?.maxPages ?? undefined;
+ const maxResults = pagination?.maxResults ?? undefined;
+ const maxWaitTime = pagination?.maxWaitTime ?? undefined;
+ const started = Date.now();
+
+ while (current) {
+ if (maxPages != null && pageCount >= maxPages) break;
+ if (maxWaitTime != null && (Date.now() - started) / 1000 > maxWaitTime) break;
+
+ let payload: { success: boolean; next?: string | null; data?: Document[] } | null = null;
+ try {
+ const res = await http.get<{ success: boolean; next?: string | null; data?: Document[] }>(current);
+ payload = res.data;
+ } catch {
+ break; // axios rejects on non-2xx; stop pagination gracefully
+ }
+ if (!payload?.success) break;
+
+ for (const d of payload.data || []) {
+ if (maxResults != null && docs.length >= maxResults) break;
+ docs.push(d as Document);
+ }
+ if (maxResults != null && docs.length >= maxResults) break;
+ current = (payload.next ?? null) as string | null;
+ pageCount += 1;
+ }
+ return docs;
+ }
+
+
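A note on the new helper: maxResults counts the documents already gathered from the first response, maxWaitTime is checked before each additional request, and a failed request ends pagination quietly with whatever has been collected so far. fetchAllPages is internal (it is not re-exported from the package entry point), so the sketch below exercises it only through a stubbed object exposing the single `get` method it relies on; the stub shape, import path, and URLs are assumptions for illustration:

import { fetchAllPages } from "./pagination"; // path relative to src/v2/utils, for illustration only

// Stub standing in for HttpClient: serves one extra page, then ends the cursor chain.
const stubHttp: any = {
  get: async (url: string) => ({
    data: url.endsWith("/page2")
      ? { success: true, next: null, data: [{ markdown: "second page" }] }
      : { success: true, next: "https://api.example.com/page2", data: [{ markdown: "first page" }] },
  }),
};

async function demo() {
  const docs = await fetchAllPages(stubHttp, "https://api.example.com/page1", [], { maxResults: 10 });
  console.log(docs.length); // 2 — both stubbed pages were followed and merged
}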