firecrawl-py 4.13.2__tar.gz → 4.14.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {firecrawl_py-4.13.2/firecrawl_py.egg-info → firecrawl_py-4.14.0}/PKG-INFO +27 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/README.md +27 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__init__.py +1 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +35 -2
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_crawl.py +29 -2
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +228 -4
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/client.py +9 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v1/client.py +8 -8
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/client.py +43 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/client_async.py +34 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/batch.py +78 -26
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/crawl.py +92 -37
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/batch.py +83 -28
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/crawl.py +99 -51
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/types.py +2 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0/firecrawl_py.egg-info}/PKG-INFO +27 -1
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/LICENSE +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/conftest.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_extract.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_map.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_search.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/test_recursive_schema_v1.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_agent.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_agent_webhook.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_branding.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/types.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v1/__init__.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/__init__.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/agent.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/agent.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/extract.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/map.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/search.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/extract.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/map.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/search.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/http_client.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/http_client_async.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/normalize.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/utils/validation.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/watcher.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl_py.egg-info/SOURCES.txt +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl_py.egg-info/dependency_links.txt +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl_py.egg-info/requires.txt +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl_py.egg-info/top_level.txt +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/pyproject.toml +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/setup.cfg +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/setup.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/tests/test_agent_integration.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/tests/test_api_key_handling.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/tests/test_change_tracking.py +0 -0
- {firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/tests/test_timeout_conversion.py +0 -0
{firecrawl_py-4.13.2/firecrawl_py.egg-info → firecrawl_py-4.14.0}/PKG-INFO
RENAMED

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: firecrawl-py
-Version: 4.13.2
+Version: 4.14.0
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai

@@ -134,6 +134,32 @@ crawl_status = firecrawl.get_crawl_status("<crawl_id>")
 print(crawl_status)
 ```
 
+### Manual Pagination (v2)
+
+Crawl and batch scrape status responses may include a `next` URL when more data is available. The SDK auto-paginates by default; to page manually, disable auto-pagination and pass the opaque `next` URL back to the SDK.
+
+```python
+from firecrawl.v2.types import PaginationConfig
+
+# Crawl: fetch one page at a time
+crawl_job = firecrawl.start_crawl("https://firecrawl.dev", limit=100)
+status = firecrawl.get_crawl_status(
+    crawl_job.id,
+    pagination_config=PaginationConfig(auto_paginate=False),
+)
+if status.next:
+    page2 = firecrawl.get_crawl_status_page(status.next)
+
+# Batch scrape: fetch one page at a time
+batch_job = firecrawl.start_batch_scrape(["https://firecrawl.dev"])
+status = firecrawl.get_batch_scrape_status(
+    batch_job.id,
+    pagination_config=PaginationConfig(auto_paginate=False),
+)
+if status.next:
+    page2 = firecrawl.get_batch_scrape_status_page(status.next)
+```
+
 ### Cancelling a Crawl
 
 To cancel an asynchronous crawl job, use the `cancel_crawl` method. It takes the job ID of the asynchronous crawl as a parameter and returns the cancellation status.
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/README.md
RENAMED

@@ -87,6 +87,32 @@ crawl_status = firecrawl.get_crawl_status("<crawl_id>")
 print(crawl_status)
 ```
 
+### Manual Pagination (v2)
+
+Crawl and batch scrape status responses may include a `next` URL when more data is available. The SDK auto-paginates by default; to page manually, disable auto-pagination and pass the opaque `next` URL back to the SDK.
+
+```python
+from firecrawl.v2.types import PaginationConfig
+
+# Crawl: fetch one page at a time
+crawl_job = firecrawl.start_crawl("https://firecrawl.dev", limit=100)
+status = firecrawl.get_crawl_status(
+    crawl_job.id,
+    pagination_config=PaginationConfig(auto_paginate=False),
+)
+if status.next:
+    page2 = firecrawl.get_crawl_status_page(status.next)
+
+# Batch scrape: fetch one page at a time
+batch_job = firecrawl.start_batch_scrape(["https://firecrawl.dev"])
+status = firecrawl.get_batch_scrape_status(
+    batch_job.id,
+    pagination_config=PaginationConfig(auto_paginate=False),
+)
+if status.next:
+    page2 = firecrawl.get_batch_scrape_status_page(status.next)
+```
+
 ### Cancelling a Crawl
 
 To cancel an asynchronous crawl job, use the `cancel_crawl` method. It takes the job ID of the asynchronous crawl as a parameter and returns the cancellation status.

@@ -184,4 +210,4 @@ firecrawl = Firecrawl(api_key="YOUR_API_KEY")
 doc_v1 = firecrawl.v1.scrape_url('https://firecrawl.dev', formats=['markdown', 'html'])
 crawl_v1 = firecrawl.v1.crawl_url('https://firecrawl.dev', limit=100)
 map_v1 = firecrawl.v1.map_url('https://firecrawl.dev')
-```
+```
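The new README section shows a single manual page fetch. To drain a job completely, one can keep following `next` until the server stops advertising one. A minimal sketch along those lines, built only on the methods documented above (waiting for completion and error handling omitted; paging a still-running job only covers documents scraped so far):

```python
from firecrawl import Firecrawl
from firecrawl.v2.types import PaginationConfig

firecrawl = Firecrawl(api_key="YOUR_API_KEY")

# Start a crawl, then fetch the first status page without auto-pagination.
crawl_job = firecrawl.start_crawl("https://firecrawl.dev", limit=100)

documents = []
status = firecrawl.get_crawl_status(
    crawl_job.id,
    pagination_config=PaginationConfig(auto_paginate=False),
)
documents.extend(status.data)

# Follow the opaque next URL until no further page is advertised.
next_url = status.next
while next_url:
    page = firecrawl.get_crawl_status_page(next_url)
    documents.extend(page.data)
    next_url = page.next

print(f"Collected {len(documents)} documents")
```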
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py
RENAMED

@@ -1,8 +1,9 @@
 import os
+import time
 import pytest
 from dotenv import load_dotenv
 from firecrawl import Firecrawl
-from firecrawl.v2.types import ScrapeOptions
+from firecrawl.v2.types import ScrapeOptions, PaginationConfig
 
 load_dotenv()
 

@@ -48,6 +49,39 @@ class TestBatchScrapeE2E:
         assert job.status in ["scraping", "completed", "failed"]
         assert job.total >= 0
 
+    def test_get_batch_scrape_status_page(self):
+        """Fetch a single batch scrape page using the next URL."""
+        urls = [f"https://docs.firecrawl.dev?batch={i}" for i in range(15)]
+
+        start_resp = self.client.start_batch_scrape(
+            urls,
+            formats=["markdown"],
+            ignore_invalid_urls=True,
+        )
+        assert start_resp.id is not None
+
+        pagination_config = PaginationConfig(auto_paginate=False)
+        deadline = time.time() + 120
+        status_job = None
+        while time.time() < deadline:
+            status_job = self.client.get_batch_scrape_status(
+                start_resp.id,
+                pagination_config=pagination_config,
+            )
+            if status_job.next:
+                break
+            if status_job.status in ["completed", "failed", "cancelled"]:
+                break
+            time.sleep(2)
+
+        assert status_job is not None
+        if not status_job.next:
+            pytest.skip("Batch scrape completed without pagination; skipping page fetch.")
+
+        next_page = self.client.get_batch_scrape_status_page(status_job.next)
+        assert isinstance(next_page.data, list)
+        assert next_page.status in ["scraping", "completed", "failed", "cancelled"]
+
     def test_wait_batch_with_all_params(self):
         """Blocking waiter with JSON and changeTracking formats plus many options."""
         urls = [

@@ -103,4 +137,3 @@ class TestBatchScrapeE2E:
 
         cancelled = self.client.cancel_batch_scrape(start_resp.id)
         assert cancelled is True
-
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/e2e/v2/test_crawl.py
RENAMED

@@ -3,7 +3,7 @@ import time
 import os
 from dotenv import load_dotenv
 from firecrawl import Firecrawl
-from firecrawl.v2.types import ScrapeOptions
+from firecrawl.v2.types import ScrapeOptions, PaginationConfig
 
 load_dotenv()
 

@@ -66,6 +66,33 @@ class TestCrawlE2E:
         assert status_job.next is None
         assert isinstance(status_job.data, list)
 
+    def test_get_crawl_status_page(self):
+        """Fetch a single crawl page using the next URL."""
+        start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=25)
+        assert start_job.id is not None
+
+        pagination_config = PaginationConfig(auto_paginate=False)
+        deadline = time.time() + 120
+        status_job = None
+        while time.time() < deadline:
+            status_job = self.client.get_crawl_status(
+                start_job.id,
+                pagination_config=pagination_config,
+            )
+            if status_job.next:
+                break
+            if status_job.status in ["completed", "failed", "cancelled"]:
+                break
+            time.sleep(2)
+
+        assert status_job is not None
+        if not status_job.next:
+            pytest.skip("Crawl completed without pagination; skipping page fetch.")
+
+        next_page = self.client.get_crawl_status_page(status_job.next)
+        assert isinstance(next_page.data, list)
+        assert next_page.status in ["scraping", "completed", "failed", "cancelled"]
+
     def test_cancel_crawl(self):
         """Test canceling a crawl."""
         start_job = self.client.start_crawl("https://docs.firecrawl.dev", limit=3)

@@ -275,4 +302,4 @@ class TestCrawlE2E:
         assert params_data is not None
         assert params_data.limit is not None or params_data.include_paths is not None or params_data.max_discovery_depth is not None
         assert 'blog/.*' in params_data.include_paths
-        assert 'docs/.*' in params_data.include_paths
+        assert 'docs/.*' in params_data.include_paths
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/__tests__/unit/v2/methods/test_pagination.py
RENAMED
@@ -14,10 +14,18 @@ from firecrawl.v2.types import (
     Document,
     DocumentMetadata
 )
-from firecrawl.v2.methods.crawl import get_crawl_status, _fetch_all_pages
-from firecrawl.v2.methods.batch import get_batch_scrape_status, _fetch_all_batch_pages
-from firecrawl.v2.methods.aio.crawl import
-
+from firecrawl.v2.methods.crawl import get_crawl_status, get_crawl_status_page, _fetch_all_pages
+from firecrawl.v2.methods.batch import get_batch_scrape_status, get_batch_scrape_status_page, _fetch_all_batch_pages
+from firecrawl.v2.methods.aio.crawl import (
+    get_crawl_status as get_crawl_status_async,
+    get_crawl_status_page as get_crawl_status_page_async,
+    _fetch_all_pages_async,
+)
+from firecrawl.v2.methods.aio.batch import (
+    get_batch_scrape_status as get_batch_scrape_status_async,
+    get_batch_scrape_status_page as get_batch_scrape_status_page_async,
+    _fetch_all_batch_pages_async,
+)
 
 
 class TestPaginationConfig:

@@ -123,6 +131,59 @@ class TestCrawlPagination:
         self.mock_client.get.assert_called_with(
             f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
         )
+
+    def test_get_crawl_status_page(self):
+        """Test get_crawl_status_page returns a single page."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 10,
+            "total": 20,
+            "creditsUsed": 5,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3",
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+        next_url = "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
+
+        result = get_crawl_status_page(self.mock_client, next_url)
+
+        assert result.status == "completed"
+        assert result.next == "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=3"
+        assert len(result.data) == 1
+        self.mock_client.get.assert_called_with(next_url, timeout=None)
+
+    def test_get_crawl_status_page_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to crawl status page requests."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        next_url = "https://api.firecrawl.dev/v2/crawl/test-crawl-123?page=2"
+        timeout_seconds = 4.2
+        result = get_crawl_status_page(
+            self.mock_client,
+            next_url,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(next_url, timeout=timeout_seconds)
 
     def test_get_crawl_status_with_pagination(self):
         """Test get_crawl_status with auto_paginate=True."""

@@ -326,6 +387,59 @@ class TestBatchScrapePagination:
         assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
         assert len(result.data) == 1
         assert isinstance(result.data[0], Document)
+
+    def test_get_batch_scrape_status_page(self):
+        """Test get_batch_scrape_status_page returns a single page."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 10,
+            "total": 20,
+            "creditsUsed": 5,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=3",
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+        next_url = "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
+
+        result = get_batch_scrape_status_page(self.mock_client, next_url)
+
+        assert result.status == "completed"
+        assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=3"
+        assert len(result.data) == 1
+        self.mock_client.get.assert_called_with(next_url, timeout=None)
+
+    def test_get_batch_scrape_status_page_propagates_request_timeout(self):
+        """Ensure request_timeout is forwarded to batch status page requests."""
+        mock_response = Mock()
+        mock_response.ok = True
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        next_url = "https://api.firecrawl.dev/v2/batch/scrape/test-batch-123?page=2"
+        timeout_seconds = 2.7
+        result = get_batch_scrape_status_page(
+            self.mock_client,
+            next_url,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_called_with(next_url, timeout=timeout_seconds)
 
     def test_get_batch_scrape_status_with_pagination(self):
         """Test get_batch_scrape_status with auto_paginate=True."""

@@ -493,6 +607,61 @@ class TestAsyncPagination:
             f"/v2/crawl/{self.job_id}", timeout=timeout_seconds
         )
 
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_page_async(self):
+        """Test async get_crawl_status_page returns a single page."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 10,
+            "total": 20,
+            "creditsUsed": 5,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": "https://api.firecrawl.dev/v2/crawl/test-async-123?page=3",
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+        next_url = "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2"
+
+        result = await get_crawl_status_page_async(self.mock_client, next_url)
+
+        assert result.status == "completed"
+        assert result.next == "https://api.firecrawl.dev/v2/crawl/test-async-123?page=3"
+        assert len(result.data) == 1
+        self.mock_client.get.assert_awaited_with(next_url, timeout=None)
+
+    @pytest.mark.asyncio
+    async def test_get_crawl_status_page_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to crawl status page requests."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        next_url = "https://api.firecrawl.dev/v2/crawl/test-async-123?page=2"
+        timeout_seconds = 6.1
+        result = await get_crawl_status_page_async(
+            self.mock_client,
+            next_url,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(next_url, timeout=timeout_seconds)
+
     @pytest.mark.asyncio
     async def test_get_batch_scrape_status_async_with_pagination(self):
         """Test async get_batch_scrape_status with pagination."""

@@ -534,6 +703,61 @@ class TestAsyncPagination:
         assert result.next is None
         assert len(result.data) == 2
         assert self.mock_client.get.call_count == 2
+
+    @pytest.mark.asyncio
+    async def test_get_batch_scrape_status_page_async(self):
+        """Test async get_batch_scrape_status_page returns a single page."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 10,
+            "total": 20,
+            "creditsUsed": 5,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=3",
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+        next_url = "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2"
+
+        result = await get_batch_scrape_status_page_async(self.mock_client, next_url)
+
+        assert result.status == "completed"
+        assert result.next == "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=3"
+        assert len(result.data) == 1
+        self.mock_client.get.assert_awaited_with(next_url, timeout=None)
+
+    @pytest.mark.asyncio
+    async def test_get_batch_scrape_status_page_async_propagates_request_timeout(self):
+        """Ensure async request_timeout is forwarded to batch status page requests."""
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "success": True,
+            "status": "completed",
+            "completed": 1,
+            "total": 1,
+            "creditsUsed": 1,
+            "expiresAt": "2024-01-01T00:00:00Z",
+            "next": None,
+            "data": [self.sample_doc],
+        }
+
+        self.mock_client.get.return_value = mock_response
+
+        next_url = "https://api.firecrawl.dev/v2/batch/scrape/test-async-123?page=2"
+        timeout_seconds = 4.4
+        result = await get_batch_scrape_status_page_async(
+            self.mock_client,
+            next_url,
+            request_timeout=timeout_seconds,
+        )
+
+        assert result.status == "completed"
+        self.mock_client.get.assert_awaited_with(next_url, timeout=timeout_seconds)
 
     @pytest.mark.asyncio
     async def test_fetch_all_pages_async_limits(self):
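Taken together, these unit tests pin down the contract of the new page fetchers: one GET against the opaque `next` URL, with `request_timeout` forwarded verbatim as the HTTP `timeout` (defaulting to `None`), returning a job object carrying `status`, `data`, and the following `next` URL. A minimal sketch consistent with those assertions — the `CrawlPage` type here is a hypothetical stand-in, and the shipped code in `firecrawl/v2/methods/crawl.py` may validate and convert the payload differently:

```python
from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class CrawlPage:
    """Hypothetical stand-in for the SDK's CrawlJob; fields mirror the mocked payloads."""
    status: str
    completed: int
    total: int
    credits_used: int
    expires_at: str
    next: Optional[str]
    data: List[Any] = field(default_factory=list)


def get_crawl_status_page(client, next_url: str, *, request_timeout: Optional[float] = None) -> CrawlPage:
    # The tests assert exactly this call shape:
    # client.get(next_url, timeout=request_timeout), with timeout=None by default.
    response = client.get(next_url, timeout=request_timeout)
    body = response.json()
    if not body.get("success"):
        raise RuntimeError(f"Failed to fetch crawl status page: {next_url}")
    # Surface the camelCase API payload as job fields.
    return CrawlPage(
        status=body["status"],
        completed=body["completed"],
        total=body["total"],
        credits_used=body["creditsUsed"],
        expires_at=body["expiresAt"],
        next=body.get("next"),
        data=body.get("data", []),
    )
```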
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/client.py
RENAMED

@@ -61,6 +61,7 @@ class V2Proxy:
         self.crawl = client_instance.crawl
         self.start_crawl = client_instance.start_crawl
         self.get_crawl_status = client_instance.get_crawl_status
+        self.get_crawl_status_page = client_instance.get_crawl_status_page
         self.cancel_crawl = client_instance.cancel_crawl
         self.get_crawl_errors = client_instance.get_crawl_errors
         self.get_active_crawls = client_instance.get_active_crawls

@@ -78,6 +79,7 @@ class V2Proxy:
 
         self.start_batch_scrape = client_instance.start_batch_scrape
         self.get_batch_scrape_status = client_instance.get_batch_scrape_status
+        self.get_batch_scrape_status_page = client_instance.get_batch_scrape_status_page
         self.cancel_batch_scrape = client_instance.cancel_batch_scrape
         self.batch_scrape = client_instance.batch_scrape
         self.get_batch_scrape_errors = client_instance.get_batch_scrape_errors

@@ -127,6 +129,7 @@ class AsyncV2Proxy:
         self.start_crawl = client_instance.start_crawl
         self.wait_crawl = client_instance.wait_crawl
         self.get_crawl_status = client_instance.get_crawl_status
+        self.get_crawl_status_page = client_instance.get_crawl_status_page
         self.cancel_crawl = client_instance.cancel_crawl
         self.get_crawl_errors = client_instance.get_crawl_errors
         self.get_active_crawls = client_instance.get_active_crawls

@@ -144,6 +147,7 @@ class AsyncV2Proxy:
 
         self.start_batch_scrape = client_instance.start_batch_scrape
         self.get_batch_scrape_status = client_instance.get_batch_scrape_status
+        self.get_batch_scrape_status_page = client_instance.get_batch_scrape_status_page
         self.cancel_batch_scrape = client_instance.cancel_batch_scrape
         self.wait_batch_scrape = client_instance.wait_batch_scrape
         self.batch_scrape = client_instance.batch_scrape

@@ -198,6 +202,7 @@ class Firecrawl:
         self.start_crawl = self._v2_client.start_crawl
         self.crawl_params_preview = self._v2_client.crawl_params_preview
         self.get_crawl_status = self._v2_client.get_crawl_status
+        self.get_crawl_status_page = self._v2_client.get_crawl_status_page
         self.cancel_crawl = self._v2_client.cancel_crawl
         self.get_crawl_errors = self._v2_client.get_crawl_errors
         self.get_active_crawls = self._v2_client.get_active_crawls

@@ -205,6 +210,7 @@ class Firecrawl:
 
         self.start_batch_scrape = self._v2_client.start_batch_scrape
         self.get_batch_scrape_status = self._v2_client.get_batch_scrape_status
+        self.get_batch_scrape_status_page = self._v2_client.get_batch_scrape_status_page
         self.cancel_batch_scrape = self._v2_client.cancel_batch_scrape
         self.batch_scrape = self._v2_client.batch_scrape
         self.get_batch_scrape_errors = self._v2_client.get_batch_scrape_errors

@@ -248,6 +254,7 @@ class AsyncFirecrawl:
 
         self.start_crawl = self._v2_client.start_crawl
         self.get_crawl_status = self._v2_client.get_crawl_status
+        self.get_crawl_status_page = self._v2_client.get_crawl_status_page
         self.cancel_crawl = self._v2_client.cancel_crawl
         self.crawl = self._v2_client.crawl
         self.get_crawl_errors = self._v2_client.get_crawl_errors

@@ -256,6 +263,7 @@ class AsyncFirecrawl:
 
         self.start_batch_scrape = self._v2_client.start_batch_scrape
         self.get_batch_scrape_status = self._v2_client.get_batch_scrape_status
+        self.get_batch_scrape_status_page = self._v2_client.get_batch_scrape_status_page
         self.cancel_batch_scrape = self._v2_client.cancel_batch_scrape
         self.batch_scrape = self._v2_client.batch_scrape
         self.get_batch_scrape_errors = self._v2_client.get_batch_scrape_errors

@@ -278,4 +286,4 @@ class AsyncFirecrawl:
 
 # Export Firecrawl as an alias for FirecrawlApp
 FirecrawlApp = Firecrawl
-AsyncFirecrawlApp = AsyncFirecrawl
+AsyncFirecrawlApp = AsyncFirecrawl
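With these assignments, the page fetchers are exposed at every entry point: `firecrawl.get_crawl_status_page(...)`, `firecrawl.v2.get_crawl_status_page(...)`, and the batch and async counterparts. A usage sketch on the async client, assuming it mirrors the sync signatures exercised by the unit tests:

```python
import asyncio

from firecrawl import AsyncFirecrawl
from firecrawl.v2.types import PaginationConfig


async def main() -> None:
    firecrawl = AsyncFirecrawl(api_key="YOUR_API_KEY")

    job = await firecrawl.start_crawl("https://firecrawl.dev", limit=50)
    status = await firecrawl.get_crawl_status(
        job.id,
        pagination_config=PaginationConfig(auto_paginate=False),
    )

    # The opaque next URL from one status call feeds straight into the
    # newly exposed page fetcher.
    if status.next:
        page = await firecrawl.get_crawl_status_page(status.next)
        print(page.status, len(page.data))


asyncio.run(main())
```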
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v1/client.py
RENAMED

@@ -150,7 +150,7 @@ class V1ScrapeOptions(pydantic.BaseModel):
     skipTlsVerification: Optional[bool] = None
     removeBase64Images: Optional[bool] = None
     blockAds: Optional[bool] = None
-    proxy: Optional[Literal["basic", "stealth", "auto"]] = None
+    proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None
     changeTrackingOptions: Optional[V1ChangeTrackingOptions] = None
     maxAge: Optional[int] = None
     storeInCache: Optional[bool] = None

@@ -542,7 +542,7 @@ class V1FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         parse_pdf: Optional[bool] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,

@@ -1441,7 +1441,7 @@ class V1FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,
         actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,

@@ -1582,7 +1582,7 @@ class V1FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,
         actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,

@@ -1722,7 +1722,7 @@ class V1FirecrawlApp:
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,
         actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,

@@ -3523,7 +3523,7 @@ class AsyncV1FirecrawlApp(V1FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         parse_pdf: Optional[bool] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,

@@ -3657,7 +3657,7 @@ class AsyncV1FirecrawlApp(V1FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,
         actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,

@@ -3796,7 +3796,7 @@ class AsyncV1FirecrawlApp(V1FirecrawlApp):
         skip_tls_verification: Optional[bool] = None,
         remove_base64_images: Optional[bool] = None,
         block_ads: Optional[bool] = None,
-        proxy: Optional[Literal["basic", "stealth", "auto"]] = None,
+        proxy: Optional[Literal["basic", "stealth", "enhanced", "auto"]] = None,
         extract: Optional[V1JsonConfig] = None,
         json_options: Optional[V1JsonConfig] = None,
         actions: Optional[List[Union[V1WaitAction, V1ScreenshotAction, V1ClickAction, V1WriteAction, V1PressAction, V1ScrollAction, V1ScrapeAction, V1ExecuteJavascriptAction, V1PDFAction]]] = None,
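The only v1 change is the widened `proxy` literal, which now accepts `"enhanced"` alongside `"basic"`, `"stealth"`, and `"auto"` across the model and all scrape signatures. A usage sketch via the v1 compatibility surface (assuming the account has access to the enhanced proxy tier):

```python
from firecrawl import Firecrawl

firecrawl = Firecrawl(api_key="YOUR_API_KEY")

# "enhanced" now passes SDK-side Literal validation; before this release
# only "basic", "stealth", and "auto" were accepted.
doc = firecrawl.v1.scrape_url(
    "https://firecrawl.dev",
    formats=["markdown"],
    proxy="enhanced",
)
print(doc)
```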
{firecrawl_py-4.13.2 → firecrawl_py-4.14.0}/firecrawl/v2/client.py
RENAMED

@@ -423,6 +423,28 @@ class FirecrawlClient:
             pagination_config=pagination_config,
             request_timeout=request_timeout,
         )
+
+    def get_crawl_status_page(
+        self,
+        next_url: str,
+        *,
+        request_timeout: Optional[float] = None,
+    ) -> CrawlJob:
+        """
+        Fetch a single page of crawl results using a next URL.
+
+        Args:
+            next_url: Opaque next URL from a prior crawl status response
+            request_timeout: Timeout (in seconds) for the HTTP request
+
+        Returns:
+            CrawlJob with the page data and next URL (if any)
+        """
+        return crawl_module.get_crawl_status_page(
+            self.http_client,
+            next_url,
+            request_timeout=request_timeout,
+        )
 
     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
         """

@@ -741,6 +763,27 @@ class FirecrawlClient:
             pagination_config=pagination_config
         )
 
+    def get_batch_scrape_status_page(
+        self,
+        next_url: str,
+        *,
+        request_timeout: Optional[float] = None,
+    ):
+        """Fetch a single page of batch scrape results using a next URL.
+
+        Args:
+            next_url: Opaque next URL from a prior batch scrape status response
+            request_timeout: Timeout (in seconds) for the HTTP request
+
+        Returns:
+            BatchScrapeJob with the page data and next URL (if any)
+        """
+        return batch_module.get_batch_scrape_status_page(
+            self.http_client,
+            next_url,
+            request_timeout=request_timeout,
+        )
+
     def cancel_batch_scrape(self, job_id: str) -> bool:
         """Cancel a running batch scrape job.
 
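These wrappers delegate to the v2 method modules, so `request_timeout` bounds only the single HTTP request for that page. A sketch of guarding a slow batch page fetch (the 10-second value is illustrative):

```python
from firecrawl import Firecrawl
from firecrawl.v2.types import PaginationConfig

firecrawl = Firecrawl(api_key="YOUR_API_KEY")

job = firecrawl.start_batch_scrape(
    ["https://firecrawl.dev", "https://docs.firecrawl.dev"],
    formats=["markdown"],
)
status = firecrawl.get_batch_scrape_status(
    job.id,
    pagination_config=PaginationConfig(auto_paginate=False),
)

if status.next:
    # request_timeout applies to this one GET; the unit tests show it is
    # forwarded verbatim as the HTTP timeout.
    page = firecrawl.get_batch_scrape_status_page(status.next, request_timeout=10.0)
    print(page.status, len(page.data))
```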