firecrawl 4.3.7__tar.gz → 4.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- {firecrawl-4.3.7 → firecrawl-4.5.0}/PKG-INFO +1 -1
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__init__.py +1 -1
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +70 -1
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/client.py +30 -17
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/client_async.py +77 -14
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/crawl.py +18 -9
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/crawl.py +68 -37
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/search.py +1 -1
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/types.py +8 -2
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/http_client.py +5 -3
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/http_client_async.py +9 -5
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl.egg-info/SOURCES.txt +1 -0
- firecrawl-4.5.0/tests/test_api_key_handling.py +44 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/LICENSE +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/README.md +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/client.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/types.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v1/__init__.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v1/client.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/__init__.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/batch.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/search.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/batch.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/normalize.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/watcher.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/pyproject.toml +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/setup.cfg +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/setup.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/tests/test_change_tracking.py +0 -0
- {firecrawl-4.3.7 → firecrawl-4.5.0}/tests/test_timeout_conversion.py +0 -0

firecrawl/__tests__/unit/v2/methods/test_pagination.py
@@ -89,6 +89,40 @@ class TestCrawlPagination:
         assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
         assert len(result.data) == 1
         assert isinstance(result.data[0], Document)
+
+    def test_get_crawl_status_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 5.5
+        import firecrawl.v2.methods.crawl as crawl_module
+
+        assert crawl_module.__file__.endswith("firecrawl/v2/methods/crawl.py")
+        assert crawl_module.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module.get_crawl_status.__kwdefaults__
+        result = get_crawl_status(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
 
     def test_get_crawl_status_with_pagination(self):
         """Test get_crawl_status with auto_paginate=True."""
@@ -423,7 +457,42 @@ class TestAsyncPagination:
         assert result.next is None
         assert len(result.data) == 2
         assert self.mock_client.get.call_count == 2
-
+
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to the HTTP client."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        timeout_seconds = 3.3
+        import firecrawl.v2.methods.aio.crawl as crawl_module_async
+
+        assert crawl_module_async.__file__.endswith("firecrawl/v2/methods/aio/crawl.py")
+        assert crawl_module_async.get_crawl_status.__kwdefaults__ is not None
+        assert "request_timeout" in crawl_module_async.get_crawl_status.__kwdefaults__
+        result = await get_crawl_status_async(
+            self.mock_client,
+            self.job_id,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(
+            f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
+        )
+
     @pytest.mark.asyncio
     async def test_get_batch_scrape_status_async_with_pagination(self):
         """Test async get_batch_scrape_status with pagination."""

firecrawl/v2/client.py
@@ -54,10 +54,14 @@ from .watcher import Watcher
 class FirecrawlClient:
     """
     Main Firecrawl v2 API client.
-
+
     This client provides a clean, modular interface to all Firecrawl functionality.
     """
-
+
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(
         self,
         api_key: Optional[str] = None,
@@ -68,7 +72,7 @@ class FirecrawlClient:
     ):
         """
         Initialize the Firecrawl client.
-
+
         Args:
             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
             api_url: Base URL for the Firecrawl API
@@ -78,13 +82,13 @@ class FirecrawlClient:
         """
         if api_key is None:
            api_key = os.getenv("FIRECRAWL_API_KEY")
-
-        if not api_key:
+
+        if self._is_cloud_service(api_url) and not api_key:
             raise ValueError(
-                "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                 "or pass api_key parameter."
             )
-
+
         self.config = ClientConfig(
             api_key=api_key,
             api_url=api_url,
@@ -92,7 +96,7 @@ class FirecrawlClient:
             max_retries=max_retries,
             backoff_factor=backoff_factor
         )
-
+
         self.http_client = HttpClient(api_key, api_url)
 
     def scrape(
@@ -236,6 +240,7 @@ class FirecrawlClient:
         zero_data_retention: bool = False,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        request_timeout: Optional[float] = None,
         integration: Optional[str] = None,
     ) -> CrawlJob:
         """
@@ -259,7 +264,8 @@ class FirecrawlClient:
             scrape_options: Page scraping configuration
             zero_data_retention: Whether to delete data after 24 hours
             poll_interval: Seconds between status checks
-            timeout: Maximum seconds to wait (None for no timeout)
+            timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+            request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout
 
         Returns:
             CrawlJob when job completes
@@ -290,10 +296,11 @@ class FirecrawlClient:
         )
 
         return crawl_module.crawl(
-            self.http_client,
-            request,
-            poll_interval=poll_interval,
-            timeout=timeout
+            self.http_client,
+            request,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=request_timeout,
         )
 
     def start_crawl(
@@ -368,9 +375,11 @@ class FirecrawlClient:
         return crawl_module.start_crawl(self.http_client, request)
 
     def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
         """
         Get the status of a crawl job.
@@ -378,6 +387,9 @@ class FirecrawlClient:
         Args:
             job_id: ID of the crawl job
             pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
 
         Returns:
             CrawlJob with current status and data
@@ -386,9 +398,10 @@ class FirecrawlClient:
             Exception: If the status check fails
         """
         return crawl_module.get_crawl_status(
-            self.http_client,
+            self.http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )
 
     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
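
Taken together, the client.py changes let callers bound each status request independently of the overall wait. A minimal usage sketch based on the signatures above; the URL is a placeholder, and the parts of the client API not shown in this diff (such as the url argument of crawl() and start_crawl()) are assumed unchanged:

    from firecrawl.v2.client import FirecrawlClient

    # Assumes FIRECRAWL_API_KEY is set in the environment (cloud API).
    client = FirecrawlClient()

    # Wait up to 300 s for the whole crawl, but cap each HTTP request
    # (including pagination requests) at 30 s.
    job = client.crawl(
        url="https://example.com",
        poll_interval=2,
        timeout=300,
        request_timeout=30.0,
    )

    # The keyword-only request_timeout also applies per page when polling a job directly.
    started = client.start_crawl(url="https://example.com")
    status = client.get_crawl_status(started.id, request_timeout=10.0)
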

firecrawl/v2/client_async.py
@@ -4,6 +4,7 @@ Async v2 client mirroring the regular client surface using true async HTTP trans
 
 import os
 import asyncio
+import time
 from typing import Optional, List, Dict, Any, Union, Callable, Literal
 from .types import (
     ScrapeOptions,
@@ -47,11 +48,15 @@ from .methods.aio import extract as async_extract # type: ignore[attr-defined]
 from .watcher_async import AsyncWatcher
 
 class AsyncFirecrawlClient:
+    @staticmethod
+    def _is_cloud_service(url: str) -> bool:
+        return "api.firecrawl.dev" in url.lower()
+
     def __init__(self, api_key: Optional[str] = None, api_url: str = "https://api.firecrawl.dev"):
         if api_key is None:
             api_key = os.getenv("FIRECRAWL_API_KEY")
-        if not api_key:
-            raise ValueError("API key is required. Set FIRECRAWL_API_KEY or pass api_key.")
+        if self._is_cloud_service(api_url) and not api_key:
+            raise ValueError("API key is required for the cloud API. Set FIRECRAWL_API_KEY or pass api_key.")
         self.http_client = HttpClient(api_key, api_url)
         self.async_http_client = AsyncHttpClient(api_key, api_url)
 
@@ -77,33 +82,91 @@ class AsyncFirecrawlClient:
         request = CrawlRequest(url=url, **kwargs)
         return await async_crawl.start_crawl(self.async_http_client, request)
 
-    async def wait_crawl(
-
-
+    async def wait_crawl(
+        self,
+        job_id: str,
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+        *,
+        request_timeout: Optional[float] = None,
+    ) -> CrawlJob:
+        """
+        Polls the status of a crawl job until it reaches a terminal state.
+
+        Args:
+            job_id (str): The ID of the crawl job to poll.
+            poll_interval (int, optional): Number of seconds to wait between polling attempts. Defaults to 2.
+            timeout (Optional[int], optional): Maximum number of seconds to wait for the entire crawl job to complete before timing out. If None, waits indefinitely. Defaults to None.
+            request_timeout (Optional[float], optional): Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout. If None, no per-request timeout is set. Defaults to None.
+
+        Returns:
+            CrawlJob: The final status of the crawl job when it reaches a terminal state.
+
+        Raises:
+            TimeoutError: If the crawl does not reach a terminal state within the specified timeout.
+
+        Terminal states:
+            - "completed": The crawl finished successfully.
+            - "failed": The crawl finished with an error.
+            - "cancelled": The crawl was cancelled.
+        """
+        start = time.monotonic()
         while True:
-            status = await async_crawl.get_crawl_status(
-
+            status = await async_crawl.get_crawl_status(
+                self.async_http_client,
+                job_id,
+                request_timeout=request_timeout,
+            )
+            if status.status in ["completed", "failed", "cancelled"]:
                 return status
-            if timeout and (
+            if timeout and (time.monotonic() - start) > timeout:
                 raise TimeoutError("Crawl wait timed out")
             await asyncio.sleep(poll_interval)
 
     async def crawl(self, **kwargs) -> CrawlJob:
         # wrapper combining start and wait
-        resp = await self.start_crawl(
+        resp = await self.start_crawl(
+            **{k: v for k, v in kwargs.items() if k not in ("poll_interval", "timeout", "request_timeout")}
+        )
         poll_interval = kwargs.get("poll_interval", 2)
         timeout = kwargs.get("timeout")
-
+        request_timeout = kwargs.get("request_timeout")
+        effective_request_timeout = request_timeout if request_timeout is not None else timeout
+        return await self.wait_crawl(
+            resp.id,
+            poll_interval=poll_interval,
+            timeout=timeout,
+            request_timeout=effective_request_timeout,
+        )
 
     async def get_crawl_status(
-        self,
+        self,
         job_id: str,
-        pagination_config: Optional[PaginationConfig] = None
+        pagination_config: Optional[PaginationConfig] = None,
+        *,
+        request_timeout: Optional[float] = None,
     ) -> CrawlJob:
+        """
+        Get the status of a crawl job.
+
+        Args:
+            job_id: ID of the crawl job
+            pagination_config: Optional configuration for pagination behavior
+            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+                is enabled (default) and there are multiple pages of results, this timeout applies to
+                each page request separately, not to the entire operation
+
+        Returns:
+            CrawlJob with current status and data
+
+        Raises:
+            Exception: If the status check fails
+        """
         return await async_crawl.get_crawl_status(
-            self.async_http_client,
+            self.async_http_client,
             job_id,
-            pagination_config=pagination_config
+            pagination_config=pagination_config,
+            request_timeout=request_timeout,
         )
 
     async def cancel_crawl(self, job_id: str) -> bool:
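
A corresponding async sketch. It relies on the kwargs-splitting behaviour of AsyncFirecrawlClient.crawl() shown above (poll_interval, timeout, and request_timeout are routed to wait_crawl(), everything else to start_crawl()); the URL is a placeholder:

    import asyncio
    from firecrawl.v2.client_async import AsyncFirecrawlClient

    async def main() -> None:
        # Assumes FIRECRAWL_API_KEY is set; a self-hosted api_url would not need it.
        client = AsyncFirecrawlClient()

        # If request_timeout were omitted, crawl() would reuse the overall timeout per request.
        job = await client.crawl(
            url="https://example.com",
            poll_interval=2,
            timeout=120,
            request_timeout=15.0,
        )
        print(job.status, len(job.data or []))

    asyncio.run(main())
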

firecrawl/v2/methods/aio/crawl.py
@@ -87,9 +87,11 @@ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlRe
 
 
 async def get_crawl_status(
-    client: AsyncHttpClient,
+    client: AsyncHttpClient,
     job_id: str,
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Get the status of a crawl job.
@@ -98,6 +100,9 @@ async def get_crawl_status(
         client: Async HTTP client instance
         job_id: ID of the crawl job
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation
 
     Returns:
         CrawlJob with job information
@@ -105,7 +110,7 @@ async def get_crawl_status(
     Raises:
         Exception: If the status check fails
     """
-    response = await client.get(f"/v2/crawl/{job_id}")
+    response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
     if response.status_code >= 400:
         handle_response_error(response, "get crawl status")
     body = response.json()
@@ -120,10 +125,11 @@ async def get_crawl_status(
     auto_paginate = pagination_config.auto_paginate if pagination_config else True
     if auto_paginate and body.get("next"):
         documents = await _fetch_all_pages_async(
-            client,
-            body.get("next"),
-            documents,
-            pagination_config
+            client,
+            body.get("next"),
+            documents,
+            pagination_config,
+            request_timeout=request_timeout,
         )
 
     return CrawlJob(
@@ -142,7 +148,9 @@ async def _fetch_all_pages_async(
     client: AsyncHttpClient,
     next_url: str,
     initial_documents: List[Document],
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> List[Document]:
     """
     Fetch all pages of crawl results asynchronously.
@@ -152,6 +160,7 @@ async def _fetch_all_pages_async(
         next_url: URL for the next page
         initial_documents: Documents from the first page
         pagination_config: Optional configuration for pagination limits
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
 
     Returns:
         List of all documents from all pages
@@ -176,7 +185,7 @@ async def _fetch_all_pages_async(
             break
 
         # Fetch next page
-        response = await client.get(current_url)
+        response = await client.get(current_url, timeout=request_timeout)
 
         if response.status_code >= 400:
             # Log error but continue with what we have

firecrawl/v2/methods/crawl.py
@@ -142,37 +142,42 @@ def start_crawl(client: HttpClient, request: CrawlRequest) -> CrawlResponse:
 
 
 def get_crawl_status(
-    client: HttpClient,
+    client: HttpClient,
     job_id: str,
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Get the status of a crawl job.
-
+
     Args:
         client: HTTP client instance
         job_id: ID of the crawl job
         pagination_config: Optional configuration for pagination behavior
-
+        request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+            is enabled (default) and there are multiple pages of results, this timeout applies to
+            each page request separately, not to the entire operation
+
     Returns:
         CrawlJob with current status and data
-
+
     Raises:
         Exception: If the status check fails
     """
     # Make the API request
-    response = client.get(f"/v2/crawl/{job_id}")
-
+    response = client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
+
     # Handle errors
     if not response.ok:
         handle_response_error(response, "get crawl status")
-
+
     # Parse response
     response_data = response.json()
-
+
     if response_data.get("success"):
         # The API returns status fields at the top level, not in a data field
-
+
         # Convert documents
         documents = []
         data_list = response_data.get("data", [])
@@ -183,17 +188,22 @@ def get_crawl_status(
                 continue
             else:
                 documents.append(Document(**normalize_document_input(doc_data)))
-
+
         # Handle pagination if requested
         auto_paginate = pagination_config.auto_paginate if pagination_config else True
-        if auto_paginate and response_data.get("next") and not (
+        if auto_paginate and response_data.get("next") and not (
+            pagination_config
+            and pagination_config.max_results is not None
+            and len(documents) >= pagination_config.max_results
+        ):
             documents = _fetch_all_pages(
-                client,
-                response_data.get("next"),
-                documents,
-                pagination_config
+                client,
+                response_data.get("next"),
+                documents,
+                pagination_config,
+                request_timeout=request_timeout,
             )
-
+
         # Create CrawlJob with current status and data
         return CrawlJob(
             status=response_data.get("status"),
@@ -212,31 +222,34 @@ def _fetch_all_pages(
     client: HttpClient,
     next_url: str,
     initial_documents: List[Document],
-    pagination_config: Optional[PaginationConfig] = None
+    pagination_config: Optional[PaginationConfig] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> List[Document]:
     """
     Fetch all pages of crawl results.
-
+
     Args:
         client: HTTP client instance
         next_url: URL for the next page
         initial_documents: Documents from the first page
         pagination_config: Optional configuration for pagination limits
-
+        request_timeout: Optional timeout (in seconds) for the underlying HTTP request
+
     Returns:
         List of all documents from all pages
     """
     documents = initial_documents.copy()
     current_url = next_url
     page_count = 0
-
+
     # Apply pagination limits
     max_pages = pagination_config.max_pages if pagination_config else None
     max_results = pagination_config.max_results if pagination_config else None
     max_wait_time = pagination_config.max_wait_time if pagination_config else None
-
+
     start_time = time.monotonic()
-
+
     while current_url:
         # Check pagination limits (treat 0 as a valid limit)
         if (max_pages is not None) and page_count >= max_pages:
@@ -244,22 +257,22 @@ def _fetch_all_pages(
 
         if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
             break
-
+
         # Fetch next page
-        response = client.get(current_url)
-
+        response = client.get(current_url, timeout=request_timeout)
+
         if not response.ok:
             # Log error but continue with what we have
             import logging
             logger = logging.getLogger("firecrawl")
             logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
             break
-
+
         page_data = response.json()
-
+
         if not page_data.get("success"):
             break
-
+
         # Add documents from this page
         data_list = page_data.get("data", [])
         for doc_data in data_list:
@@ -270,15 +283,15 @@ def _fetch_all_pages(
             if max_results is not None and len(documents) >= max_results:
                 break
             documents.append(Document(**normalize_document_input(doc_data)))
-
+
         # Check if we hit max_results limit
         if max_results is not None and len(documents) >= max_results:
             break
-
+
         # Get next URL
         current_url = page_data.get("next")
         page_count += 1
-
+
     return documents
 
 
@@ -309,7 +322,9 @@ def wait_for_crawl_completion(
     client: HttpClient,
     job_id: str,
     poll_interval: int = 2,
-    timeout: Optional[int] = None
+    timeout: Optional[int] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Wait for a crawl job to complete, polling for status updates.
@@ -319,6 +334,7 @@ def wait_for_crawl_completion(
         job_id: ID of the crawl job
         poll_interval: Seconds between status checks
         timeout: Maximum seconds to wait (None for no timeout)
+        request_timeout: Optional timeout (in seconds) for each status request
 
     Returns:
         CrawlJob when job completes
@@ -330,7 +346,11 @@ def wait_for_crawl_completion(
     start_time = time.monotonic()
 
     while True:
-        crawl_job = get_crawl_status(
+        crawl_job = get_crawl_status(
+            client,
+            job_id,
+            request_timeout=request_timeout,
+        )
 
         # Check if job is complete
         if crawl_job.status in ["completed", "failed", "cancelled"]:
@@ -348,7 +368,9 @@ def crawl(
     client: HttpClient,
     request: CrawlRequest,
     poll_interval: int = 2,
-    timeout: Optional[int] = None
+    timeout: Optional[int] = None,
+    *,
+    request_timeout: Optional[float] = None,
 ) -> CrawlJob:
     """
     Start a crawl job and wait for it to complete.
@@ -357,7 +379,9 @@ def crawl(
         client: HTTP client instance
         request: CrawlRequest containing URL and options
         poll_interval: Seconds between status checks
-        timeout: Maximum seconds to wait (None for no timeout)
+        timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
+        request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination
+            requests when fetching results. If there are multiple pages, each page request gets this timeout
 
     Returns:
         CrawlJob when job completes
@@ -371,9 +395,16 @@ def crawl(
     crawl_job = start_crawl(client, request)
     job_id = crawl_job.id
 
+    # Determine the per-request timeout. If not provided, reuse the overall timeout value.
+    effective_request_timeout = request_timeout if request_timeout is not None else timeout
+
     # Wait for completion
     return wait_for_crawl_completion(
-        client,
+        client,
+        job_id,
+        poll_interval,
+        timeout,
+        request_timeout=effective_request_timeout,
    )
 
 
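
The only non-mechanical change in crawl() above is the fallback for the per-request timeout. A tiny sketch of that rule in isolation; the helper name is hypothetical, the SDK simply inlines the expression:

    # Mirrors the fallback applied in firecrawl/v2/methods/crawl.py: an explicit
    # request_timeout wins, otherwise the overall timeout doubles as the per-request cap.
    def pick_request_timeout(request_timeout, timeout):
        return request_timeout if request_timeout is not None else timeout

    assert pick_request_timeout(30.0, 300) == 30.0   # explicit per-request value
    assert pick_request_timeout(None, 300) == 300    # falls back to the overall timeout
    assert pick_request_timeout(None, None) is None  # no timeout at all
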

firecrawl/v2/methods/search.py
@@ -123,7 +123,7 @@ def _validate_search_request(request: SearchRequest) -> SearchRequest:
 
     # Validate categories (if provided)
     if request.categories is not None:
-        valid_categories = {"github", "research"}
+        valid_categories = {"github", "research", "pdf"}
         for category in request.categories:
             if isinstance(category, str):
                 if category not in valid_categories:
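
With "pdf" accepted by the validator above, a hedged sketch of a category-filtered search. It assumes the v2 search() method exposes a categories argument that maps onto the SearchRequest.categories field being validated here:

    from firecrawl.v2.client import FirecrawlClient

    client = FirecrawlClient()  # assumes FIRECRAWL_API_KEY is set

    # "pdf" is now valid alongside "github" and "research"; per the Category docstring
    # below it effectively adds filetype:pdf to the search.
    results = client.search("large language model evaluation", categories=["pdf"])
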

firecrawl/v2/types.py
@@ -186,7 +186,13 @@ class Source(BaseModel):
 SourceOption = Union[str, Source]
 
 class Category(BaseModel):
-    """Configuration for a search category.
+    """Configuration for a search category.
+
+    Supported categories:
+    - "github": Filter results to GitHub repositories
+    - "research": Filter results to research papers and academic sites
+    - "pdf": Filter results to PDF files (adds filetype:pdf to search)
+    """
     type: str
 
 CategoryOption = Union[str, Category]
@@ -762,7 +768,7 @@ class ActiveCrawlsRequest(BaseModel):
 # Configuration types
 class ClientConfig(BaseModel):
     """Configuration for the Firecrawl client."""
-    api_key: str
+    api_key: Optional[str] = None
     api_url: str = "https://api.firecrawl.dev"
     timeout: Optional[float] = None
     max_retries: int = 3

firecrawl/v2/utils/http_client.py
@@ -12,8 +12,8 @@ version = get_version()
 
 class HttpClient:
     """HTTP client with retry logic and error handling."""
-
-    def __init__(self, api_key: str, api_url: str):
+
+    def __init__(self, api_key: Optional[str], api_url: str):
         self.api_key = api_key
         self.api_url = api_url
 
@@ -43,8 +43,10 @@ class HttpClient:
         """Prepare headers for API requests."""
         headers = {
             'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}',
         }
+
+        if self.api_key:
+            headers['Authorization'] = f'Bearer {self.api_key}'
 
         if idempotency_key:
             headers['x-idempotency-key'] = idempotency_key

firecrawl/v2/utils/http_client_async.py
@@ -6,15 +6,19 @@ version = get_version()
 
 
 class AsyncHttpClient:
-    def __init__(self, api_key: str, api_url: str):
+    def __init__(self, api_key: Optional[str], api_url: str):
         self.api_key = api_key
         self.api_url = api_url
+        headers = {
+            "Content-Type": "application/json",
+        }
+
+        if api_key:
+            headers["Authorization"] = f"Bearer {api_key}"
+
         self._client = httpx.AsyncClient(
             base_url=api_url,
-            headers={
-                "Authorization": f"Bearer {api_key}",
-                "Content-Type": "application/json",
-            },
+            headers=headers,
             limits=httpx.Limits(max_keepalive_connections=0),
         )
 

firecrawl-4.5.0/tests/test_api_key_handling.py (new file)
@@ -0,0 +1,44 @@
+import sys
+from pathlib import Path
+
+import pytest
+
+ROOT = Path(__file__).resolve().parents[1]
+if str(ROOT) not in sys.path:
+    sys.path.insert(0, str(ROOT))
+
+from firecrawl.v2.client import FirecrawlClient
+from firecrawl.v2.client_async import AsyncFirecrawlClient
+
+
+@pytest.fixture(autouse=True)
+def clear_firecrawl_api_key_env(monkeypatch):
+    monkeypatch.delenv("FIRECRAWL_API_KEY", raising=False)
+    yield
+
+
+def test_cloud_requires_api_key():
+    with pytest.raises(ValueError):
+        FirecrawlClient(api_url="https://api.firecrawl.dev")
+
+
+def test_self_host_allows_missing_api_key():
+    client = FirecrawlClient(api_url="http://localhost:3000")
+    assert client.http_client.api_key is None
+
+
+def test_async_cloud_requires_api_key():
+    with pytest.raises(ValueError):
+        AsyncFirecrawlClient(api_url="https://api.firecrawl.dev")
+
+
+@pytest.mark.asyncio
+async def test_async_self_host_allows_missing_api_key():
+    client = AsyncFirecrawlClient(api_url="http://localhost:3000")
+    try:
+        assert client.http_client.api_key is None
+        await client.async_http_client.close()
+    finally:
+        # Ensure the underlying HTTPX client is closed even if assertions fail
+        if not client.async_http_client._client.is_closed:
+            await client.async_http_client.close()
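
The tests above pin down the relaxed key requirement: the cloud endpoint still refuses to construct a client without a key, while a self-hosted URL no longer needs one and no Authorization header is sent. A minimal sketch against a self-hosted instance, with the localhost URL as a placeholder:

    from firecrawl.v2.client import FirecrawlClient

    # No API key and no FIRECRAWL_API_KEY needed for a non-cloud api_url;
    # HttpClient simply omits the Authorization header in this case.
    client = FirecrawlClient(api_url="http://localhost:3000")

    # scrape() is assumed to take the target URL as its first argument.
    doc = client.scrape("https://example.com")
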