firecrawl 4.4.0__py3-none-any.whl → 4.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

firecrawl/__init__.py CHANGED
@@ -17,7 +17,7 @@ from .v1 import (
      V1ChangeTrackingOptions,
  )
 
- __version__ = "4.4.0"
+ __version__ = "4.5.0"
 
  # Define the logger for the Firecrawl project
  logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/unit/v2/methods/test_pagination.py CHANGED
@@ -89,6 +89,40 @@ class TestCrawlPagination:
          assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
          assert len(result.data) == 1
          assert isinstance(result.data[0], Document)
+
+     def test_get_crawl_status_propagates_request_timeout(self):
+         """Ensure request_timeout is forwarded to the HTTP client."""
+         mock_response = Mock()
+         mock_response.ok = True
+         mock_response.json.return_value = {
+             "success": True,
+             "status": "completed",
+             "completed": 1,
+             "total": 1,
+             "creditsUsed": 1,
+             "expiresAt": "2024-01-01T00:00:00Z",
+             "next": None,
+             "data": [self.sample_doc],
+         }
+
+         self.mock_client.get.return_value = mock_response
+
+         timeout_seconds = 5.5
+         import firecrawl.v2.methods.crawl as crawl_module
+
+         assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
+         assert crawl_module.get_crawl_status.__kwdefaults__ is not None
+         assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
+         result = get_crawl_status(
+             self.mock_client,
+             self.job_id,
+             request_timeout=timeout_seconds,
+         )
+
+         assert result.status == "completed"
+         self.mock_client.get.assert_called_with(
+             f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+         )
 
      def test_get_crawl_status_with_pagination(self):
          """Test get_crawl_status with auto_paginate=True."""
@@ -423,7 +457,42 @@ class TestAsyncPagination:
          assert result.next is None
          assert len(result.data) == 2
          assert self.mock_client.get.call_count == 2
-
+
+     @pytest.mark.asyncio
+     async def test_get_crawl_status_async_propagates_request_timeout(self):
+         """Ensure async request_timeout is forwarded to the HTTP client."""
+         mock_response = Mock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             "success": True,
+             "status": "completed",
+             "completed": 1,
+             "total": 1,
+             "creditsUsed": 1,
+             "expiresAt": "2024-01-01T00:00:00Z",
+             "next": None,
+             "data": [self.sample_doc],
+         }
+
+         self.mock_client.get.return_value = mock_response
+
+         timeout_seconds = 3.3
+         import firecrawl.v2.methods.aio.crawl as crawl_module_async
+
+         assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
+         assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
+         assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
+         result = await get_crawl_status_async(
+             self.mock_client,
+             self.job_id,
+             request_timeout=timeout_seconds,
+         )
+
+         assert result.status == "completed"
+         self.mock_client.get.assert_awaited_with(
+             f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+         )
+
      @pytest.mark.asyncio
      async def test_get_batch_scrape_status_async_with_pagination(self):
          """Test async get_batch_scrape_status with pagination."""
firecrawl/v2/client.py CHANGED
@@ -54,10 +54,14 @@ from .watcher import Watcher
  class FirecrawlClient:
      """
      Main Firecrawl v2 API client.
-
+
      This client provides a clean, modular interface to all Firecrawl functionality.
      """
-
+
+     @staticmethod
+     def _is_cloud_service(url: str) -> bool:
+         return "api.firecrawl.dev" in url.lower()
+
      def __init__(
          self,
          api_key: Optional[str] = None,
@@ -68,7 +72,7 @@ class FirecrawlClient:
      ):
          """
          Initialize the Firecrawl client.
-
+
          Args:
              api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
              api_url: Base URL for the Firecrawl API
@@ -78,13 +82,13 @@ class FirecrawlClient:
          """
          if api_key is None:
              api_key = os.getenv("FIRECRAWL_API_KEY")
-
-         if not api_key:
+
+         if self._is_cloud_service(api_url) and not api_key:
              raise ValueError(
-                 "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                 "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                  "or pass api_key parameter."
              )
-
+
          self.config = ClientConfig(
              api_key=api_key,
              api_url=api_url,
@@ -92,7 +96,7 @@ class FirecrawlClient:
              max_retries=max_retries,
              backoff_factor=backoff_factor
          )
-
+
          self.http_client = HttpClient(api_key, api_url)
 
      def scrape(
@@ -236,6 +240,7 @@ class FirecrawlClient:
          zero_data_retention: bool = False,
          poll_interval: int = 2,
          timeout: Optional[int] = None,
+         request_timeout: Optional[float] = None,
          integration: Optional[str] = None,
      ) -> CrawlJob:
          """
@@ -259,7 +264,8 @@ class FirecrawlClient:
              scrape_options: Page scraping configuration
              zero_data_retention: Whether to delete data after 24 hours
              poll_interval: Seconds between status checks
-             timeout: Maximum seconds to wait (None for no timeout)
+             timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+             request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout
 
          Returns:
              CrawlJob when job completes
@@ -290,10 +296,11 @@ class FirecrawlClient:
          )
 
          return crawl_module.crawl(
-             self.http_client,
-             request,
-             poll_interval=poll_interval,
-             timeout=timeout
+             self.http_client,
+             request,
+             poll_interval=poll_interval,
+             timeout=timeout,
+             request_timeout=request_timeout,
          )
 
      def start_crawl(
@@ -368,9 +375,11 @@ class FirecrawlClient:
          return crawl_module.start_crawl(self.http_client, request)
 
      def get_crawl_status(
-         self,
+         self,
          job_id: str,
-         pagination_config: Optional[PaginationConfig] = None
+         pagination_config: Optional[PaginationConfig] = None,
+         *,
+         request_timeout: Optional[float] = None,
      ) -> CrawlJob:
          """
          Get the status of a crawl job.
@@ -378,6 +387,9 @@ class FirecrawlClient:
          Args:
              job_id: ID of the crawl job
              pagination_config: Optional configuration for pagination behavior
+             request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                 is enabled (default) and there are multiple pages of results, this timeout applies to
+                 each page request separately, not to the entire operation
 
          Returns:
              CrawlJob with current status and data
@@ -386,9 +398,10 @@ class FirecrawlClient:
              Exception: If the status check fails
          """
          return crawl_module.get_crawl_status(
-             self.http_client,
+             self.http_client,
              job_id,
-             pagination_config=pagination_config
+             pagination_config=pagination_config,
+             request_timeout=request_timeout,
          )
 
      def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
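Taken together, the client.py changes have two user-visible effects: an API key is only enforced when the client points at api.firecrawl.dev, and crawl()/get_crawl_status() now take a request_timeout that is applied to every HTTP call, including each pagination request. A minimal usage sketch, not part of the diff; the key, job ID, URLs and the url= keyword are illustrative assumptions:

from firecrawl.v2.client import FirecrawlClient

# Cloud API: a key is still required (api_key=... or FIRECRAWL_API_KEY).
cloud = FirecrawlClient(api_key="fc-YOUR-KEY")

# Each status/page request gets its own 10-second budget; auto-pagination
# may issue several such requests for a single call.
job = cloud.get_crawl_status("example-job-id", request_timeout=10.0)

# crawl() forwards both values: timeout bounds the whole job, while
# request_timeout bounds each HTTP request made while polling/paginating.
job = cloud.crawl(url="https://example.com", timeout=300, request_timeout=20.0)

# Self-hosted deployments can now be used without an API key.
local = FirecrawlClient(api_url="http://localhost:3000")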
firecrawl/v2/client_async.py CHANGED
@@ -4,6 +4,7 @@ Async v2 client mirroring the regular client surface using true async HTTP trans
 
  import os
  import asyncio
+ import time
  from typing import Optional, List, Dict, Any, Union, Callable, Literal
  from .types import (
      ScrapeOptions,
@@ -47,11 +48,15 @@ from .methods.aio import extract as async_extract # type: ignore[attr-defined]
  from .watcher_async import AsyncWatcher
 
  class AsyncFirecrawlClient:
+     @staticmethod
+     def _is_cloud_service(url: str) -> bool:
+         return "api.firecrawl.dev" in url.lower()
+
      def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
          if api_key is None:
              api_key = os.getenv("FIRECRAWL_API_KEY")
-         if not api_key:
-             raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+         if self._is_cloud_service(api_url) and not api_key:
+             raise ValueError("API key is required for the cloud API. Set FIRECRAWL_API_KEY or pass api_key.")
          self.http_client = HttpClient(api_key, api_url)
          self.async_http_client = AsyncHttpClient(api_key, api_url)
 
@@ -77,33 +82,91 @@ class AsyncFirecrawlClient:
          request = CrawlRequest(url=url, **kwargs)
          return await async_crawl.start_crawl(self.async_http_client, request)
 
-     async def wait_crawl(self, job_id: str, poll_interval: int = 2, timeout: Optional[int] = None) -> CrawlJob:
-         # simple polling loop using blocking get (ok for test-level async)
-         start = asyncio.get_event_loop().time()
+     async def wait_crawl(
+         self,
+         job_id: str,
+         poll_interval: int = 2,
+         timeout: Optional[int] = None,
+         *,
+         request_timeout: Optional[float] = None,
+     ) -> CrawlJob:
+         """
+         Polls the status of a crawl job until it reaches a terminal state.
+
+         Args:
+             job_id (str): The ID of the crawl job to poll.
+             poll_interval (int, optional): Number of seconds to wait between polling attempts. Defaults to 2.
+             timeout (Optional[int], optional): Maximum number of seconds to wait for the entire crawl job to complete before timing out. If None, waits indefinitely. Defaults to None.
+             request_timeout (Optional[float], optional): Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout. If None, no per-request timeout is set. Defaults to None.
+
+         Returns:
+             CrawlJob: The final status of the crawl job when it reaches a terminal state.
+
+         Raises:
+             TimeoutError: If the crawl does not reach a terminal state within the specified timeout.
+
+         Terminal states:
+             - "completed": The crawl finished successfully.
+             - "failed": The crawl finished with an error.
+             - "cancelled": The crawl was cancelled.
+         """
+         start = time.monotonic()
          while True:
-             status = await async_crawl.get_crawl_status(self.async_http_client, job_id)
-             if status.status in ["completed", "failed"]:
+             status = await async_crawl.get_crawl_status(
+                 self.async_http_client,
+                 job_id,
+                 request_timeout=request_timeout,
+             )
+             if status.status in ["completed", "failed", "cancelled"]:
                  return status
-             if timeout and (asyncio.get_event_loop().time() - start) > timeout:
+             if timeout and (time.monotonic() - start) > timeout:
                  raise TimeoutError("Crawl wait timed out")
              await asyncio.sleep(poll_interval)
 
      async def crawl(self, **kwargs) -> CrawlJob:
          # wrapper combining start and wait
-         resp = await self.start_crawl(**{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout")})
+         resp = await self.start_crawl(
+             **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout", "request_timeout")}
+         )
          poll_interval = kwargs.get("poll_interval", 2)
          timeout = kwargs.get("timeout")
-         return await self.wait_crawl(resp.id, poll_interval=poll_interval, timeout=timeout)
+         request_timeout = kwargs.get("request_timeout")
+         effective_request_timeout = request_timeout if request_timeout is not None else timeout
+         return await self.wait_crawl(
+             resp.id,
+             poll_interval=poll_interval,
+             timeout=timeout,
+             request_timeout=effective_request_timeout,
+         )
 
      async def get_crawl_status(
-         self,
+         self,
          job_id: str,
-         pagination_config: Optional[PaginationConfig] = None
+         pagination_config: Optional[PaginationConfig] = None,
+         *,
+         request_timeout: Optional[float] = None,
      ) -> CrawlJob:
+         """
+         Get the status of a crawl job.
+
+         Args:
+             job_id: ID of the crawl job
+             pagination_config: Optional configuration for pagination behavior
+             request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                 is enabled (default) and there are multiple pages of results, this timeout applies to
+                 each page request separately, not to the entire operation
+
+         Returns:
+             CrawlJob with current status and data
+
+         Raises:
+             Exception: If the status check fails
+         """
          return await async_crawl.get_crawl_status(
-             self.async_http_client,
+             self.async_http_client,
              job_id,
-             pagination_config=pagination_config
+             pagination_config=pagination_config,
+             request_timeout=request_timeout,
          )
 
      async def cancel_crawl(self, job_id: str) -> bool:
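The async client mirrors this surface: wait_crawl(), crawl() and get_crawl_status() all accept request_timeout, and crawl() falls back to the overall timeout when it is not supplied. A short sketch under the same assumptions (placeholder key and job ID):

import asyncio
from firecrawl.v2.client_async import AsyncFirecrawlClient

async def main() -> None:
    client = AsyncFirecrawlClient(api_key="fc-YOUR-KEY")
    # Poll every 2 seconds, give the whole job up to 120 seconds, and cap
    # each status request at 15 seconds. "cancelled" is now terminal too.
    job = await client.wait_crawl(
        "example-job-id",
        poll_interval=2,
        timeout=120,
        request_timeout=15.0,
    )
    print(job.status)

asyncio.run(main())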
firecrawl/v2/methods/aio/crawl.py CHANGED
@@ -87,9 +87,11 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
 
 
  async def get_crawl_status(
-     client: AsyncHttpClient,
+     client: AsyncHttpClient,
      job_id: str,
-     pagination_config: Optional[PaginationConfig] = None
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> CrawlJob:
      """
      Get the status of a crawl job.
@@ -98,6 +100,9 @@ async def get_crawl_status(
          client: Async HTTP client instance
          job_id: ID of the crawl job
          pagination_config: Optional configuration for pagination limits
+         request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+             is enabled (default) and there are multiple pages of results, this timeout applies to
+             each page request separately, not to the entire operation
 
      Returns:
          CrawlJob with job information
@@ -105,7 +110,7 @@ async def get_crawl_status(
      Raises:
          Exception: If the status check fails
      """
-     response = await client.get(f"/v2/crawl/{job_id}")
+     response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
      if response.status_code >= 400:
          handle_response_error(response, "get crawl status")
      body = response.json()
@@ -120,10 +125,11 @@ async def get_crawl_status(
      auto_paginate = pagination_config.auto_paginate if pagination_config else True
      if auto_paginate and body.get("next"):
          documents = await _fetch_all_pages_async(
-             client,
-             body.get("next"),
-             documents,
-             pagination_config
+             client,
+             body.get("next"),
+             documents,
+             pagination_config,
+             request_timeout=request_timeout,
          )
 
      return CrawlJob(
@@ -142,7 +148,9 @@ async def _fetch_all_pages_async(
      client: AsyncHttpClient,
      next_url: str,
      initial_documents: List[Document],
-     pagination_config: Optional[PaginationConfig] = None
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> List[Document]:
      """
      Fetch all pages of crawl results asynchronously.
@@ -152,6 +160,7 @@ async def _fetch_all_pages_async(
          next_url: URL for the next page
          initial_documents: Documents from the first page
          pagination_config: Optional configuration for pagination limits
+         request_timeout: Optional timeout (in seconds) for the underlying HTTP request
 
      Returns:
          List of all documents from all pages
@@ -176,7 +185,7 @@ async def _fetch_all_pages_async(
              break
 
          # Fetch next page
-         response = await client.get(current_url)
+         response = await client.get(current_url, timeout=request_timeout)
 
          if response.status_code >= 400:
              # Log error but continue with what we have
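The module-level helper can also be driven directly with an AsyncHttpClient; request_timeout is passed as timeout= to client.get() for the status call and for every pagination request that follows it. A hedged sketch (placeholder key and job ID):

import asyncio
from firecrawl.v2.utils.http_client_async import AsyncHttpClient
from firecrawl.v2.methods.aio import crawl as async_crawl

async def main() -> None:
    client = AsyncHttpClient("fc-YOUR-KEY", "https://api.firecrawl.dev")
    # Every HTTP request made while resolving this status (including
    # paginated result pages) is capped at 10 seconds.
    job = await async_crawl.get_crawl_status(
        client,
        "example-job-id",
        request_timeout=10.0,
    )
    print(job.status, len(job.data))
    await client.close()

asyncio.run(main())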
firecrawl/v2/methods/crawl.py CHANGED
@@ -142,37 +142,42 @@ def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
 
 
  def get_crawl_status(
-     client: HttpClient,
+     client: HttpClient,
      job_id: str,
-     pagination_config: Optional[PaginationConfig] = None
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> CrawlJob:
      """
      Get the status of a crawl job.
-
+
      Args:
          client: HTTP client instance
          job_id: ID of the crawl job
          pagination_config: Optional configuration for pagination behavior
-
+         request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+             is enabled (default) and there are multiple pages of results, this timeout applies to
+             each page request separately, not to the entire operation
+
      Returns:
          CrawlJob with current status and data
-
+
      Raises:
          Exception: If the status check fails
      """
      # Make the API request
-     response = client.get(f"/v2/crawl/{job_id}")
-
+     response = client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
+
      # Handle errors
      if not response.ok:
          handle_response_error(response, "get crawl status")
-
+
      # Parse response
      response_data = response.json()
-
+
      if response_data.get("success"):
          # The API returns status fields at the top level, not in a data field
-
+
          # Convert documents
          documents = []
          data_list = response_data.get("data", [])
@@ -183,17 +188,22 @@ def get_crawl_status(
                  continue
              else:
                  documents.append(Document(**normalize_document_input(doc_data)))
-
+
          # Handle pagination if requested
          auto_paginate = pagination_config.auto_paginate if pagination_config else True
-         if auto_paginate and response_data.get("next") and not (pagination_config and pagination_config.max_results is not None and len(documents) >= pagination_config.max_results):
+         if auto_paginate and response_data.get("next") and not (
+             pagination_config
+             and pagination_config.max_results is not None
+             and len(documents) >= pagination_config.max_results
+         ):
              documents = _fetch_all_pages(
-                 client,
-                 response_data.get("next"),
-                 documents,
-                 pagination_config
+                 client,
+                 response_data.get("next"),
+                 documents,
+                 pagination_config,
+                 request_timeout=request_timeout,
              )
-
+
          # Create CrawlJob with current status and data
          return CrawlJob(
              status=response_data.get("status"),
@@ -212,31 +222,34 @@ def _fetch_all_pages(
      client: HttpClient,
      next_url: str,
      initial_documents: List[Document],
-     pagination_config: Optional[PaginationConfig] = None
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> List[Document]:
      """
      Fetch all pages of crawl results.
-
+
      Args:
          client: HTTP client instance
          next_url: URL for the next page
          initial_documents: Documents from the first page
          pagination_config: Optional configuration for pagination limits
-
+         request_timeout: Optional timeout (in seconds) for the underlying HTTP request
+
      Returns:
          List of all documents from all pages
      """
      documents = initial_documents.copy()
      current_url = next_url
      page_count = 0
-
+
      # Apply pagination limits
      max_pages = pagination_config.max_pages if pagination_config else None
      max_results = pagination_config.max_results if pagination_config else None
      max_wait_time = pagination_config.max_wait_time if pagination_config else None
-
+
      start_time = time.monotonic()
-
+
      while current_url:
          # Check pagination limits (treat 0 as a valid limit)
          if (max_pages is not None) and page_count >= max_pages:
@@ -244,22 +257,22 @@ def _fetch_all_pages(
 
          if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
              break
-
+
          # Fetch next page
-         response = client.get(current_url)
-
+         response = client.get(current_url, timeout=request_timeout)
+
          if not response.ok:
              # Log error but continue with what we have
              import logging
              logger = logging.getLogger("firecrawl")
              logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
              break
-
+
          page_data = response.json()
-
+
          if not page_data.get("success"):
              break
-
+
          # Add documents from this page
          data_list = page_data.get("data", [])
          for doc_data in data_list:
@@ -270,15 +283,15 @@ def _fetch_all_pages(
              if max_results is not None and len(documents) >= max_results:
                  break
              documents.append(Document(**normalize_document_input(doc_data)))
-
+
          # Check if we hit max_results limit
          if max_results is not None and len(documents) >= max_results:
              break
-
+
          # Get next URL
          current_url = page_data.get("next")
          page_count += 1
-
+
      return documents
 
 
@@ -309,7 +322,9 @@ def wait_for_crawl_completion(
      client: HttpClient,
      job_id: str,
      poll_interval: int = 2,
-     timeout: Optional[int] = None
+     timeout: Optional[int] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> CrawlJob:
      """
      Wait for a crawl job to complete, polling for status updates.
@@ -319,6 +334,7 @@ def wait_for_crawl_completion(
          job_id: ID of the crawl job
          poll_interval: Seconds between status checks
          timeout: Maximum seconds to wait (None for no timeout)
+         request_timeout: Optional timeout (in seconds) for each status request
 
      Returns:
          CrawlJob when job completes
@@ -330,7 +346,11 @@ def wait_for_crawl_completion(
      start_time = time.monotonic()
 
      while True:
-         crawl_job = get_crawl_status(client, job_id)
+         crawl_job = get_crawl_status(
+             client,
+             job_id,
+             request_timeout=request_timeout,
+         )
 
          # Check if job is complete
          if crawl_job.status in ["completed", "failed", "cancelled"]:
@@ -348,7 +368,9 @@ def crawl(
      client: HttpClient,
      request: CrawlRequest,
      poll_interval: int = 2,
-     timeout: Optional[int] = None
+     timeout: Optional[int] = None,
+     *,
+     request_timeout: Optional[float] = None,
  ) -> CrawlJob:
      """
      Start a crawl job and wait for it to complete.
@@ -357,7 +379,9 @@ def crawl(
          client: HTTP client instance
          request: CrawlRequest containing URL and options
          poll_interval: Seconds between status checks
-         timeout: Maximum seconds to wait (None for no timeout)
+         timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+         request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination
+             requests when fetching results. If there are multiple pages, each page request gets this timeout
 
      Returns:
          CrawlJob when job completes
@@ -371,9 +395,16 @@ def crawl(
      crawl_job = start_crawl(client, request)
      job_id = crawl_job.id
 
+     # Determine the per-request timeout. If not provided, reuse the overall timeout value.
+     effective_request_timeout = request_timeout if request_timeout is not None else timeout
+
      # Wait for completion
      return wait_for_crawl_completion(
-         client, job_id, poll_interval, timeout
+         client,
+         job_id,
+         poll_interval,
+         timeout,
+         request_timeout=effective_request_timeout,
      )
 
 
firecrawl/v2/types.py CHANGED
@@ -768,7 +768,7 @@ class ActiveCrawlsRequest(BaseModel):
  # Configuration types
  class ClientConfig(BaseModel):
      """Configuration for the Firecrawl client."""
-     api_key: str
+     api_key: Optional[str] = None
      api_url: str = "https://api.firecrawl.dev"
      timeout: Optional[float] = None
      max_retries: int = 3
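Because ClientConfig.api_key is now Optional, a config can be constructed without a key, which is what makes the keyless self-hosted path above possible. A minimal sketch (the localhost URL is a placeholder):

from firecrawl.v2.types import ClientConfig

# Valid as of 4.5.0: no api_key, intended for self-hosted deployments.
config = ClientConfig(api_url="http://localhost:3000")
assert config.api_key is None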
firecrawl/v2/utils/http_client.py CHANGED
@@ -12,8 +12,8 @@ version = get_version()
 
  class HttpClient:
      """HTTP client with retry logic and error handling."""
-
-     def __init__(self, api_key: str, api_url: str):
+
+     def __init__(self, api_key: Optional[str], api_url: str):
          self.api_key = api_key
          self.api_url = api_url
 
@@ -43,8 +43,10 @@ class HttpClient:
          """Prepare headers for API requests."""
          headers = {
              'Content-Type': 'application/json',
-             'Authorization': f'Bearer {self.api_key}',
          }
+
+         if self.api_key:
+             headers['Authorization'] = f'Bearer {self.api_key}'
 
          if idempotency_key:
              headers['x-idempotency-key'] = idempotency_key
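With this change the sync client only attaches an Authorization header when a key is present, and the async client in the next section applies the same rule when building its httpx.AsyncClient. A standalone sketch of the resulting header shape (illustrative helper, not code from the package):

from typing import Optional

def build_headers(api_key: Optional[str]) -> dict:
    # Content-Type is always sent; Authorization only when a key exists.
    headers = {"Content-Type": "application/json"}
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"
    return headers

assert "Authorization" not in build_headers(None)
assert build_headers("fc-123")["Authorization"] == "Bearer fc-123"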
firecrawl/v2/utils/http_client_async.py CHANGED
@@ -6,15 +6,19 @@ version = get_version()
 
 
  class AsyncHttpClient:
-     def __init__(self, api_key: str, api_url: str):
+     def __init__(self, api_key: Optional[str], api_url: str):
          self.api_key = api_key
          self.api_url = api_url
+         headers = {
+             "Content-Type": "application/json",
+         }
+
+         if api_key:
+             headers["Authorization"] = f"Bearer {api_key}"
+
          self._client = httpx.AsyncClient(
              base_url=api_url,
-             headers={
-                 "Authorization": f"Bearer {api_key}",
-                 "Content-Type": "application/json",
-             },
+             headers=headers,
              limits=httpx.Limits(max_keepalive_connections=0),
          )
 
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: firecrawl
- Version: 4.4.0
+ Version: 4.5.0
  Summary: Python SDK for Firecrawl API
  Home-page: https://github.com/firecrawl/firecrawl
  Author: Mendable.ai
@@ -1,4 +1,4 @@
- firecrawl/__init__.py,sha256=-2sr3h_Oqik7BIh0eYYKkdjWLu_tzqPIWmwCPj4bv_Q,2192
+ firecrawl/__init__.py,sha256=h5JuKDOp26ACul3BYZGHNqnTW8xeuPd7kiSSPtW1tVo,2192
  firecrawl/client.py,sha256=Lmrg2jniCETU6_xVMn_fgLrgDXiBixK9hSkkdsCGiog,11840
  firecrawl/firecrawl.backup.py,sha256=v1FEN3jR4g5Aupg4xp6SLkuFvYMQuUKND2YELbYjE6c,200430
  firecrawl/types.py,sha256=RmLTq14Z-Nf883wgZxubrtn2HDu9mecsCEdcIdBCu14,2923
@@ -25,7 +25,7 @@ firecrawl/__tests__/unit/v2/methods/test_crawl_params.py,sha256=p9hzg14uAs1iHKXP
  firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py,sha256=PEKbooNXfQwPpvcPHXABJnveztgAA-RFBhtlSs8uPro,8780
  firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py,sha256=kErOmHSD01eMjXiMd4rgsMVGd_aU2G9uVymBjbAFoGw,3918
  firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py,sha256=w4FZrDqk9XGOuCHw3SV5CkbRuyb_F4Kc8C5eJ7zVcFs,1959
- firecrawl/__tests__/unit/v2/methods/test_pagination.py,sha256=wNc9UtdauII_jzsjlJh645NBRq4IbQij1NeBwbyTjBU,22463
+ firecrawl/__tests__/unit/v2/methods/test_pagination.py,sha256=-L4MLt6P_UVaQZQP9GVajxktABzqJHz7CojxuZnGjwI,24967
  firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py,sha256=mxx4B7v4cC42ivLUCosFB2cBIaBI7m9uOUsbE8pyyGU,4077
  firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py,sha256=HVqXDKO3602gPq-Rl0bXpfbAG5o0QBH51dgy8IOmm5s,6163
  firecrawl/__tests__/unit/v2/methods/test_search_validation.py,sha256=7UGcNHpQzCpZbAPYjthfdPFWmAPcoApY-ED-khtuANs,9498
@@ -44,13 +44,13 @@ firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py,sha256=87w47n0iOihtu4jTR4
  firecrawl/v1/__init__.py,sha256=aP1oisPeZVGGZynvENc07JySMOZfv_4zAlxQ0ecMJXA,481
  firecrawl/v1/client.py,sha256=2Rq38RxGnuf2dMCmr4cc3f-ythavcBkUyJmRrwLmMHg,208104
  firecrawl/v2/__init__.py,sha256=Jc6a8tBjYG5OPkjDM5pl-notyys-7DEj7PLEfepv3fc,137
- firecrawl/v2/client.py,sha256=KBDN8p7epuCOG0XNb-jcftxfboufgYLYl6d3RiYyORA,32828
- firecrawl/v2/client_async.py,sha256=lnVnnjwVDVYHT1a2IiBooZi4rPt75gdxpzD0WpRrvb8,11457
- firecrawl/v2/types.py,sha256=0cv39cqOyLYWV-UloJS-qTyF8jKNt13312mP8kMzRSg,26411
+ firecrawl/v2/client.py,sha256=32gO-51LVD2RqPO4o-4WLHLWKQIq-mA8aAbhS8fRftA,33675
+ firecrawl/v2/client_async.py,sha256=ZgCcU3Ro_3PfqJ_tilBWcRqJr7BbQVd9VIVVOcftvr4,13963
+ firecrawl/v2/types.py,sha256=wQLlJ3gc-EBWI6zMr_01P3U8M02772I_prhB7JUGNYA,26428
  firecrawl/v2/watcher.py,sha256=FOU71tqSKxgeuGycu4ye0SLc2dw7clIcoQjPsi-4Csc,14229
  firecrawl/v2/watcher_async.py,sha256=dMMACMgeKrne_xSYeRvPu0m8nXqdNkDEsaiNBiD5ilw,10370
  firecrawl/v2/methods/batch.py,sha256=-eGnCGgB76pY-BFVKG1DC58XViETWukQXtDU0esU_UU,14865
- firecrawl/v2/methods/crawl.py,sha256=bNHdIrknZSSnDzctGtTSoWIJ6vSTUOTB4F_Le8Oy3RY,18810
+ firecrawl/v2/methods/crawl.py,sha256=XyR88MPfF11HUOhZR3JJTQIv477ZyJ2uty286H-p5K4,20049
  firecrawl/v2/methods/extract.py,sha256=5EcgBzF8uNwA7auzco8xWdOycVV-Y44e04xJG4nlfZY,4982
  firecrawl/v2/methods/map.py,sha256=MH8jhLIFsp-4IC9womVtdCyarnGTeMqBXqwL21TRbFk,2849
  firecrawl/v2/methods/scrape.py,sha256=CSHBwC-P91UfrW3zHirjNAs2h899FKcWvd1DY_4fJdo,1921
@@ -58,7 +58,7 @@ firecrawl/v2/methods/search.py,sha256=cyHYDioLtr3QKWqFMibXEjl-JeG-UzircmEvzR2NzC
  firecrawl/v2/methods/usage.py,sha256=NqkmFd-ziw8ijbZxwaxjxZHl85u0LTe_TYqr_NGWFwE,3693
  firecrawl/v2/methods/aio/__init__.py,sha256=RocMJnGwnLIvGu3G8ZvY8INkipC7WHZiu2bE31eSyJs,35
  firecrawl/v2/methods/aio/batch.py,sha256=0R01YcWqk4Tkilbec1EH2fqY614F5PPICQmILRJg38A,6840
- firecrawl/v2/methods/aio/crawl.py,sha256=zLYmiYgwuqnussrEGyDOsej78lqQBKacg8wFKhRN0Qc,11684
+ firecrawl/v2/methods/aio/crawl.py,sha256=X5P4X_kEI2-Fcm4p46I-qdUW-RomlreAQNBxztX2pfo,12244
  firecrawl/v2/methods/aio/extract.py,sha256=oc7LcjJ3g3nGYJeedEn2YWOg8X0NqgQpd0DrlI0SyiU,4516
  firecrawl/v2/methods/aio/map.py,sha256=4dIRBz6GRj_Ip6gbfFKi4ojN9nKBKEp8CXW4sdxFZaA,2551
  firecrawl/v2/methods/aio/scrape.py,sha256=ilA9qco8YGwCFpE0PN1XBQUyuHPQwH2QioZ-xsfxhgU,1386
@@ -67,14 +67,15 @@ firecrawl/v2/methods/aio/usage.py,sha256=iUzTkdAWRheq-V5rRXcW0bc3MSODaVS1AqroRF0
  firecrawl/v2/utils/__init__.py,sha256=i1GgxySmqEXpWSBQCu3iZBPIJG7fXj0QXCDWGwerWNs,338
  firecrawl/v2/utils/error_handler.py,sha256=Iuf916dHphDY8ObNNlWy75628DFeJ0Rv8ljRp4LttLE,4199
  firecrawl/v2/utils/get_version.py,sha256=0CxW_41q2hlzIxEWOivUCaYw3GFiSIH32RPUMcIgwAY,492
- firecrawl/v2/utils/http_client.py,sha256=gUrC1CvU5sj03w27Lbq-3-yH38Yi_OXiI01-piwA83w,6027
- firecrawl/v2/utils/http_client_async.py,sha256=iy89_bk2HS3afSRHZ8016eMCa9Fk-5MFTntcOHfbPgE,1936
+ firecrawl/v2/utils/http_client.py,sha256=0hII3mnF_1Vd1nElu-hC9PipTUABGamUKb27q92_m5E,6068
+ firecrawl/v2/utils/http_client_async.py,sha256=Mt6Dw_i2R_W81ONXnl9N_AlPiggfylOPfbD5Rpgi7tA,1991
  firecrawl/v2/utils/normalize.py,sha256=nlTU6QRghT1YKZzNZlIQj4STSRuSUGrS9cCErZIcY5w,3636
  firecrawl/v2/utils/validation.py,sha256=zzpCK4McM4P8Cag0_8s-d7Ww0idyTWKB4-yk92MT-rY,15405
- firecrawl-4.4.0.dist-info/licenses/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+ firecrawl-4.5.0.dist-info/licenses/LICENSE,sha256=nPCunEDwjRGHlmjvsiDUyIWbkqqyj3Ej84ntnh0g0zA,1084
+ tests/test_api_key_handling.py,sha256=iNaHp6zc9bIwpN3DdiWB2Rzk0j7HCP7VgpRE_1byNYc,1303
  tests/test_change_tracking.py,sha256=_IJ5ShLcoj2fHDBaw-nE4I4lHdmDB617ocK_XMHhXps,4177
  tests/test_timeout_conversion.py,sha256=PWlIEMASQNhu4cp1OW_ebklnE9NCiigPnEFCtI5N3w0,3996
- firecrawl-4.4.0.dist-info/METADATA,sha256=koMmKCkduC_7WcdU1z0_1G0b8sKaQpBQDOYb1b_8OvE,7392
- firecrawl-4.4.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- firecrawl-4.4.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
- firecrawl-4.4.0.dist-info/RECORD,,
+ firecrawl-4.5.0.dist-info/METADATA,sha256=2VyFb6xIR5LW8RItrgSwkVofbg5f4fH68qE9PRfnxc4,7392
+ firecrawl-4.5.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ firecrawl-4.5.0.dist-info/top_level.txt,sha256=8T3jOaSN5mtLghO-R3MQ8KO290gIX8hmfxQmglBPdLE,16
+ firecrawl-4.5.0.dist-info/RECORD,,
tests/test_api_key_handling.py ADDED
@@ -0,0 +1,44 @@
+ import sys
+ from pathlib import Path
+
+ import pytest
+
+ ROOT = Path(__file__).resolve().parents[1]
+ if str(ROOT) not in sys.path:
+     sys.path.insert(0, str(ROOT))
+
+ from firecrawl.v2.client import FirecrawlClient
+ from firecrawl.v2.client_async import AsyncFirecrawlClient
+
+
+ @pytest.fixture(autouse=True)
+ def clear_firecrawl_api_key_env(monkeypatch):
+     monkeypatch.delenv("FIRECRAWL_API_KEY", raising=False)
+     yield
+
+
+ def test_cloud_requires_api_key():
+     with pytest.raises(ValueError):
+         FirecrawlClient(api_url="https://api.firecrawl.dev")
+
+
+ def test_self_host_allows_missing_api_key():
+     client = FirecrawlClient(api_url="http://localhost:3000")
+     assert client.http_client.api_key is None
+
+
+ def test_async_cloud_requires_api_key():
+     with pytest.raises(ValueError):
+         AsyncFirecrawlClient(api_url="https://api.firecrawl.dev")
+
+
+ @pytest.mark.asyncio
+ async def test_async_self_host_allows_missing_api_key():
+     client = AsyncFirecrawlClient(api_url="http://localhost:3000")
+     try:
+         assert client.http_client.api_key is None
+         await client.async_http_client.close()
+     finally:
+         # Ensure the underlying HTTPX client is closed even if assertions fail
+         if not client.async_http_client._client.is_closed:
+             await client.async_http_client.close()