firecrawl 3.4.0__py3-none-any.whl → 4.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

@@ -31,6 +31,7 @@ from .types import (
     ExecuteJavascriptAction,
     PDFAction,
     Location,
+    PaginationConfig,
 )
 from .utils.http_client import HttpClient
 from .utils.http_client_async import AsyncHttpClient
@@ -94,8 +95,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_crawl_status(self, job_id: str) -> CrawlJob:
-        return await async_crawl.get_crawl_status(self.async_http_client, job_id)
+    async def get_crawl_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ) -> CrawlJob:
+        return await async_crawl.get_crawl_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_crawl(self, job_id: str) -> bool:
         return await async_crawl.cancel_crawl(self.async_http_client, job_id)
@@ -154,8 +163,16 @@ class AsyncFirecrawlClient:
         timeout = kwargs.get("timeout")
         return await self.wait_batch_scrape(job_id, poll_interval=poll_interval, timeout=timeout)
 
-    async def get_batch_scrape_status(self, job_id: str):
-        return await async_batch.get_batch_scrape_status(self.async_http_client, job_id)
+    async def get_batch_scrape_status(
+        self,
+        job_id: str,
+        pagination_config: Optional[PaginationConfig] = None
+    ):
+        return await async_batch.get_batch_scrape_status(
+            self.async_http_client,
+            job_id,
+            pagination_config=pagination_config
+        )
 
     async def cancel_batch_scrape(self, job_id: str) -> bool:
         return await async_batch.cancel_batch_scrape(self.async_http_client, job_id)
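
The client-facing effect of the three hunks above: get_crawl_status and get_batch_scrape_status now accept an optional PaginationConfig. A minimal usage sketch, assuming the package exports AsyncFirecrawlClient and PaginationConfig at the paths shown (neither the public import paths nor the client constructor appear in this diff), with the config fields taken from the attribute accesses in the hunks below:

import asyncio

from firecrawl import AsyncFirecrawlClient      # assumed top-level export
from firecrawl.types import PaginationConfig    # assumed location of PaginationConfig


async def main() -> None:
    client = AsyncFirecrawlClient(api_key="fc-...")  # hypothetical constructor arguments

    # Cap auto-pagination instead of following every "next" link.
    config = PaginationConfig(max_pages=5, max_results=200, max_wait_time=30)
    crawl_job = await client.get_crawl_status("crawl-job-id", pagination_config=config)
    print(crawl_job.status, len(crawl_job.data))

    # With no config, the default remains auto_paginate=True: all pages are merged into .data.
    batch_job = await client.get_batch_scrape_status("batch-job-id")
    print(batch_job.completed, "/", batch_job.total)


asyncio.run(main())
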
@@ -1,8 +1,10 @@
 from typing import Optional, List, Dict, Any
-from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob
+from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob, PaginationConfig
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.validation import prepare_scrape_options
 from ...utils.error_handler import handle_response_error
+from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
@@ -39,7 +41,25 @@ async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs)
     return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))
 
 
-async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> BatchScrapeJob:
+async def get_batch_scrape_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> BatchScrapeJob:
+    """
+    Get the status of a batch scrape job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
+
+    Returns:
+        BatchScrapeJob containing job status and data
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get batch scrape status")
@@ -49,23 +69,102 @@ async def get_batch_scrape_status(client: AsyncHttpClient, job_id: str) -> Batch
     docs: List[Document] = []
     for doc in body.get("data", []) or []:
         if isinstance(doc, dict):
-            normalized = dict(doc)
-            if 'rawHtml' in normalized and 'raw_html' not in normalized:
-                normalized['raw_html'] = normalized.pop('rawHtml')
-            if 'changeTracking' in normalized and 'change_tracking' not in normalized:
-                normalized['change_tracking'] = normalized.pop('changeTracking')
+            normalized = normalize_document_input(doc)
             docs.append(Document(**normalized))
+
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        docs = await _fetch_all_batch_pages_async(
+            client,
+            body.get("next"),
+            docs,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
         data=docs,
     )
 
 
+async def _fetch_all_batch_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits
+        if (max_pages is not None) and (page_count >= max_pages):
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning(f"Failed to fetch next page: {response.status_code}")
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
     response = await client.delete(f"/v2/batch/scrape/{job_id}")
     if response.status_code >= 400:
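
The hunk above also drops the inline rawHtml/changeTracking key renaming in favor of the shared normalize_document_input helper. That helper's body is not part of this diff; judging only from the removed inline code, a behaviorally equivalent sketch for those two keys would be:

from typing import Any, Dict

# Sketch only: the real ...utils.normalize helper in firecrawl may handle additional keys.
_KEY_MAP = {
    "rawHtml": "raw_html",
    "changeTracking": "change_tracking",
}


def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of doc with known camelCase keys renamed to snake_case."""
    normalized = dict(doc)
    for camel, snake in _KEY_MAP.items():
        if camel in normalized and snake not in normalized:
            normalized[snake] = normalized.pop(camel)
    return normalized
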
@@ -1,4 +1,4 @@
-from typing import Optional, Dict, Any
+from typing import Optional, Dict, Any, List
 from ...types import (
     CrawlRequest,
     CrawlJob,
@@ -10,11 +10,13 @@ from ...types import (
     CrawlErrorsResponse,
     ActiveCrawlsResponse,
     ActiveCrawl,
+    PaginationConfig,
 )
 from ...utils.error_handler import handle_response_error
 from ...utils.validation import prepare_scrape_options
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.normalize import normalize_document_input
+import time
 
 
 def _prepare_crawl_request(request: CrawlRequest) -> dict:
@@ -58,6 +60,20 @@ def _prepare_crawl_request(request: CrawlRequest) -> dict:
 
 
 async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
+    """
+    Start a crawl job for a website.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlRequest containing URL and options
+
+    Returns:
+        CrawlResponse with job information
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the crawl operation fails to start
+    """
     payload = _prepare_crawl_request(request)
     response = await client.post("/v2/crawl", payload)
     if response.status_code >= 400:
@@ -68,7 +84,25 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
-async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
+async def get_crawl_status(
+    client: AsyncHttpClient,
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
+) -> CrawlJob:
+    """
+    Get the status of a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        CrawlJob with job information
+
+    Raises:
+        Exception: If the status check fails
+    """
     response = await client.get(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
@@ -79,19 +113,115 @@ async def get_crawl_status(client: AsyncHttpClient, job_id: str) -> CrawlJob:
             if isinstance(doc_data, dict):
                 normalized = normalize_document_input(doc_data)
                 documents.append(Document(**normalized))
+
+        # Handle pagination if requested
+        auto_paginate = pagination_config.auto_paginate if pagination_config else True
+        if auto_paginate and body.get("next"):
+            documents = await _fetch_all_pages_async(
+                client,
+                body.get("next"),
+                documents,
+                pagination_config
+            )
+
         return CrawlJob(
             status=body.get("status"),
             completed=body.get("completed", 0),
             total=body.get("total", 0),
             credits_used=body.get("creditsUsed", 0),
             expires_at=body.get("expiresAt"),
-            next=body.get("next"),
+            next=body.get("next") if not auto_paginate else None,
             data=documents,
         )
     raise Exception(body.get("error", "Unknown error occurred"))
 
 
+async def _fetch_all_pages_async(
+    client: AsyncHttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of crawl results asynchronously.
+
+    Args:
+        client: Async HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = await client.get(current_url)
+
+        if response.status_code >= 400:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc_data in page_data.get("data", []):
+            if isinstance(doc_data, dict):
+                # Check max_results limit
+                if (max_results is not None) and (len(documents) >= max_results):
+                    break
+                normalized = normalize_document_input(doc_data)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit
+        if (max_results is not None) and (len(documents) >= max_results):
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
+    """
+    Cancel a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        job_id: ID of the crawl job
+
+    Returns:
+        True if cancellation was successful
+
+    Raises:
+        Exception: If the cancellation operation fails
+    """
     response = await client.delete(f"/v2/crawl/{job_id}")
     if response.status_code >= 400:
         handle_response_error(response, "cancel crawl")
@@ -100,6 +230,20 @@ async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
 
 
 async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+    """
+    Preview crawl parameters before starting a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        request: CrawlParamsRequest containing URL and prompt
+
+    Returns:
+        CrawlParamsData containing crawl configuration
+
+    Raises:
+        ValueError: If request is invalid
+        Exception: If the parameter preview fails
+    """
     if not request.url or not request.url.strip():
         raise ValueError("URL cannot be empty")
     if not request.prompt or not request.prompt.strip():
@@ -138,6 +282,19 @@ async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequ
 
 
 async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
+    """
+    Get errors from a crawl job.
+
+    Args:
+        client: Async HTTP client instance
+        crawl_id: ID of the crawl job
+
+    Returns:
+        CrawlErrorsResponse with errors and robots blocked
+
+    Raises:
+        Exception: If the error check operation fails
+    """
     response = await client.get(f"/v2/crawl/{crawl_id}/errors")
     if response.status_code >= 400:
         handle_response_error(response, "check crawl errors")
@@ -151,6 +308,18 @@ async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlError
 
 
 async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
+    """
+    Get active crawl jobs.
+
+    Args:
+        client: Async HTTP client instance
+
+    Returns:
+        ActiveCrawlsResponse with active crawl jobs
+
+    Raises:
+        Exception: If the active crawl jobs operation fails
+    """
     response = await client.get("/v2/crawl/active")
     if response.status_code >= 400:
         handle_response_error(response, "get active crawls")
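
The crawl and batch helpers above only ever read four attributes from PaginationConfig (auto_paginate, max_pages, max_results, max_wait_time); the model itself lives in the types module, which this diff does not include. An illustrative definition consistent with that usage, with the Pydantic base class and defaults being assumptions:

from typing import Optional

from pydantic import BaseModel  # base class assumed; the SDK's types module is not shown here


class PaginationConfig(BaseModel):
    """Controls how get_crawl_status / get_batch_scrape_status follow "next" links."""

    auto_paginate: bool = True              # False: return only the first page and keep "next" set
    max_pages: Optional[int] = None         # stop after fetching this many additional pages
    max_results: Optional[int] = None       # stop once this many documents have been collected
    max_wait_time: Optional[float] = None   # stop after roughly this many seconds of pagination
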
@@ -11,6 +11,7 @@ from ..types import (
     ScrapeOptions,
     Document,
     WebhookConfig,
+    PaginationConfig,
 )
 from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
 from ..utils.normalize import normalize_document_input
@@ -77,7 +78,8 @@ def start_batch_scrape(
 
 def get_batch_scrape_status(
     client: HttpClient,
-    job_id: str
+    job_id: str,
+    pagination_config: Optional[PaginationConfig] = None
 ) -> BatchScrapeJob:
     """
     Get the status of a batch scrape job.
@@ -85,9 +87,10 @@ def get_batch_scrape_status(
     Args:
         client: HTTP client instance
         job_id: ID of the batch scrape job
+        pagination_config: Optional configuration for pagination behavior
 
     Returns:
-        BatchScrapeStatusResponse containing job status and data
+        BatchScrapeJob containing job status and data
 
     Raises:
         FirecrawlError: If the status check fails
@@ -111,17 +114,99 @@ def get_batch_scrape_status(
             normalized = normalize_document_input(doc)
             documents.append(Document(**normalized))
 
+    # Handle pagination if requested
+    auto_paginate = pagination_config.auto_paginate if pagination_config else True
+    if auto_paginate and body.get("next"):
+        documents = _fetch_all_batch_pages(
+            client,
+            body.get("next"),
+            documents,
+            pagination_config
+        )
+
     return BatchScrapeJob(
         status=body.get("status"),
         completed=body.get("completed", 0),
         total=body.get("total", 0),
         credits_used=body.get("creditsUsed"),
         expires_at=body.get("expiresAt"),
-        next=body.get("next"),
+        next=body.get("next") if not auto_paginate else None,
        data=documents,
     )
 
 
+def _fetch_all_batch_pages(
+    client: HttpClient,
+    next_url: str,
+    initial_documents: List[Document],
+    pagination_config: Optional[PaginationConfig] = None
+) -> List[Document]:
+    """
+    Fetch all pages of batch scrape results.
+
+    Args:
+        client: HTTP client instance
+        next_url: URL for the next page
+        initial_documents: Documents from the first page
+        pagination_config: Optional configuration for pagination limits
+
+    Returns:
+        List of all documents from all pages
+    """
+    documents = initial_documents.copy()
+    current_url = next_url
+    page_count = 0
+
+    # Apply pagination limits
+    max_pages = pagination_config.max_pages if pagination_config else None
+    max_results = pagination_config.max_results if pagination_config else None
+    max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+    start_time = time.monotonic()
+
+    while current_url:
+        # Check pagination limits (treat 0 as a valid limit)
+        if (max_pages is not None) and page_count >= max_pages:
+            break
+
+        if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+            break
+
+        # Fetch next page
+        response = client.get(current_url)
+
+        if not response.ok:
+            # Log error but continue with what we have
+            import logging
+            logger = logging.getLogger("firecrawl")
+            logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+            break
+
+        page_data = response.json()
+
+        if not page_data.get("success"):
+            break
+
+        # Add documents from this page
+        for doc in page_data.get("data", []) or []:
+            if isinstance(doc, dict):
+                # Check max_results limit
+                if max_results is not None and len(documents) >= max_results:
+                    break
+                normalized = normalize_document_input(doc)
+                documents.append(Document(**normalized))
+
+        # Check if we hit max_results limit after adding all docs from this page
+        if max_results is not None and len(documents) >= max_results:
+            break
+
+        # Get next URL
+        current_url = page_data.get("next")
+        page_count += 1
+
+    return documents
+
+
 def cancel_batch_scrape(
     client: HttpClient,
     job_id: str
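
When auto_paginate is disabled, get_batch_scrape_status returns only the first page and leaves the job's next URL populated, so callers can walk the remaining pages themselves. A sketch of that pattern, assuming client is an already-configured HttpClient and that PaginationConfig and get_batch_scrape_status are in scope as defined above:

def collect_first_pages(client, job_id: str, pages: int = 3):
    """Fetch the first page via the helper, then follow up to pages - 1 "next" links manually."""
    config = PaginationConfig(auto_paginate=False)
    job = get_batch_scrape_status(client, job_id, pagination_config=config)

    documents = list(job.data)
    next_url, fetched = job.next, 1
    while next_url and fetched < pages:
        page = client.get(next_url).json()            # same HttpClient call the helper above uses
        documents.extend(page.get("data", []) or [])  # raw dicts; normalize before building Documents
        next_url = page.get("next")
        fetched += 1
    return documents
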
@@ -173,7 +258,7 @@ def wait_for_batch_completion(
         FirecrawlError: If the job fails or timeout is reached
         TimeoutError: If timeout is reached
     """
-    start_time = time.time()
+    start_time = time.monotonic()
 
     while True:
         status_job = get_batch_scrape_status(client, job_id)
@@ -183,7 +268,7 @@
             return status_job
 
         # Check timeout
-        if timeout and (time.time() - start_time) > timeout:
+        if timeout and (time.monotonic() - start_time) > timeout:
             raise TimeoutError(f"Batch scrape job {job_id} did not complete within {timeout} seconds")
 
         # Wait before next poll
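
The final two hunks swap time.time() for time.monotonic() in the polling loop. A monotonic clock is unaffected by wall-clock adjustments (NTP corrections, manual clock changes), so an elapsed-time timeout can neither fire early nor stall. The general pattern, shown here as a standalone illustration rather than SDK code:

import time


def wait_until(predicate, timeout: float, poll_interval: float = 1.0) -> bool:
    """Poll predicate() until it returns True, using a monotonic clock for the timeout."""
    start = time.monotonic()
    while True:
        if predicate():
            return True
        if (time.monotonic() - start) > timeout:
            raise TimeoutError(f"condition not met within {timeout} seconds")
        time.sleep(poll_interval)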