firecrawl 4.3.1__tar.gz → 4.3.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl might be problematic.
- {firecrawl-4.3.1 → firecrawl-4.3.3}/PKG-INFO +1 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__init__.py +1 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_crawl.py +4 -2
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_extract.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_map.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_scrape.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_search.py +1 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +18 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +2 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +3 -2
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +2 -2
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +3 -2
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +18 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +4 -2
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v1/client.py +7 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/client.py +22 -6
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/client_async.py +7 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/batch.py +3 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/crawl.py +2 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/extract.py +7 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/map.py +4 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/search.py +5 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/batch.py +1 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/crawl.py +3 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/extract.py +7 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/map.py +4 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/search.py +4 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/types.py +51 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/validation.py +3 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl.egg-info/PKG-INFO +1 -1
- {firecrawl-4.3.1 → firecrawl-4.3.3}/LICENSE +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/README.md +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/conftest.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_async.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_usage.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_pagination.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/client.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/firecrawl.backup.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/types.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v1/__init__.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/__init__.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/__init__.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/scrape.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/aio/usage.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/scrape.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/methods/usage.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/__init__.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/error_handler.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/get_version.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/http_client.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/http_client_async.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/utils/normalize.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/watcher.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl/v2/watcher_async.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl.egg-info/SOURCES.txt +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl.egg-info/dependency_links.txt +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl.egg-info/requires.txt +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/firecrawl.egg-info/top_level.txt +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/pyproject.toml +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/setup.cfg +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/setup.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/tests/test_change_tracking.py +0 -0
- {firecrawl-4.3.1 → firecrawl-4.3.3}/tests/test_timeout_conversion.py +0 -0
```diff
@@ -32,6 +32,7 @@ async def test_async_extract_with_schema_and_options():
         allow_external_links=False,
         enable_web_search=False,
         show_sources=False,
+        integration="_e2e-test",
         # agent={"model": "FIRE-1", "prompt": "Extract title"},  # Skipping agent test in CI
     )
     assert res is not None
```
```diff
@@ -168,7 +168,8 @@ class TestCrawlE2E:
             limit=3,
             max_discovery_depth=2,
             poll_interval=1,
-            timeout=120
+            timeout=120,
+            integration="_e2e-test",
         )

         assert crawl_job.status in ["completed", "failed"]
@@ -257,7 +258,8 @@ class TestCrawlE2E:
             max_concurrency=2,
             webhook="https://example.com/hook",
             scrape_options=scrape_opts,
-            zero_data_retention=False
+            zero_data_retention=False,
+            integration="_e2e-test",
         )

         assert crawl_job.id is not None
```
```diff
@@ -59,3 +59,21 @@ class TestAsyncCrawlRequestPreparation:
         assert "metadata" not in webhook
         assert "events" not in webhook

+    def test_all_fields_including_integration(self):
+        req = CrawlRequest(
+            url="https://example.com",
+            include_paths=["/docs/*"],
+            exclude_paths=["/admin/*"],
+            max_discovery_depth=2,
+            sitemap="include",
+            ignore_query_parameters=True,
+            crawl_entire_domain=False,
+            allow_external_links=True,
+            allow_subdomains=True,
+            max_concurrency=3,
+            zero_data_retention=False,
+            integration=" _unit-test ",
+        )
+        payload = _prepare_crawl_request(req)
+        assert payload["integration"] == "_unit-test"
+
```
```diff
@@ -9,11 +9,12 @@ class TestAsyncMapRequestPreparation:
         assert payload["url"] == "https://example.com"

     def test_fields(self):
-        opts = MapOptions(search="docs", include_subdomains=True, limit=10, sitemap="only", timeout=15000)
+        opts = MapOptions(search="docs", include_subdomains=True, limit=10, sitemap="only", timeout=15000, integration=" _unit-test ")
         payload = _prepare_map_request("https://example.com", opts)
         assert payload["search"] == "docs"
         assert payload["includeSubdomains"] is True
         assert payload["limit"] == 10
         assert payload["sitemap"] == "only"
         assert payload["timeout"] == 15000
+        assert payload["integration"] == "_unit-test"

```
```diff
@@ -33,10 +33,12 @@ class TestAsyncSearchRequestPreparation:
             ignore_invalid_urls=False,
             timeout=30000,
             scrape_options=scrape_opts,
+            integration=" _unit-test ",
         )
         data = _prepare_search_request(request)
         assert data["ignoreInvalidURLs"] is False
         assert "scrapeOptions" in data
+        assert data["integration"] == "_unit-test"

     def test_exclude_none_behavior(self):
         request = SearchRequest(
@@ -59,5 +61,4 @@ class TestAsyncSearchRequestPreparation:
         assert "scrapeOptions" in data
         scrape_data = data["scrapeOptions"]
         assert "onlyMainContent" in scrape_data
-        assert "mobile" in scrape_data
-
+        assert "mobile" in scrape_data
```
```diff
@@ -76,14 +76,14 @@ class TestBatchScrapeRequestPreparation:
             ignore_invalid_urls=True,
             max_concurrency=5,
             zero_data_retention=True,
-            integration="test",
+            integration="_unit-test",
         )
         assert isinstance(data["webhook"], dict) and data["webhook"]["url"] == "https://hook.test"
         assert data["appendToId"] == "00000000-0000-0000-0000-000000000000"
         assert data["ignoreInvalidURLs"] is True
         assert data["maxConcurrency"] == 5
         assert data["zeroDataRetention"] is True
-        assert data["integration"] == "test"
+        assert data["integration"] == "_unit-test"

     def test_string_webhook_is_passed_verbatim(self):
         data = prepare_batch_scrape_request(["https://example.com"], webhook="https://hook.simple")
```
```diff
@@ -35,6 +35,7 @@ class TestMapRequestPreparation:
             limit=25,
             sitemap="only",
             timeout=15000,
+            integration=" _unit-test ",
         )
         data = _prepare_map_request("https://example.com", opts)

@@ -44,10 +45,10 @@ class TestMapRequestPreparation:
         assert data["limit"] == 25
         assert data["sitemap"] == "only"
         assert data["timeout"] == 15000
+        assert data["integration"] == "_unit-test"

     def test_invalid_url(self):
         with pytest.raises(ValueError):
             _prepare_map_request("")
         with pytest.raises(ValueError):
-            _prepare_map_request(" ")
-
+            _prepare_map_request(" ")
```
```diff
@@ -89,4 +89,21 @@ class TestScrapeRequestPreparation:
     def test_whitespace_url_validation(self):
         """Test validation with whitespace-only URL."""
         with pytest.raises(ValueError, match="URL cannot be empty"):
-            _prepare_scrape_request(" ")
+            _prepare_scrape_request(" ")
+
+    def test_all_params_including_integration(self):
+        opts = ScrapeOptions(
+            formats=["markdown"],
+            headers={"User-Agent": "Test"},
+            include_tags=["h1"],
+            exclude_tags=["nav"],
+            only_main_content=False,
+            timeout=15000,
+            wait_for=2000,
+            mobile=True,
+            skip_tls_verification=True,
+            remove_base64_images=False,
+            integration=" _unit-test ",
+        )
+        data = _prepare_scrape_request("https://example.com", opts)
+        assert data["integration"] == "_unit-test"
```
```diff
@@ -43,7 +43,8 @@ class TestSearchRequestPreparation:
             location="US",
             ignore_invalid_urls=False,
             timeout=30000,
-            scrape_options=scrape_opts
+            scrape_options=scrape_opts,
+            integration=" _e2e-test ",
         )

         data = _prepare_search_request(request)
@@ -83,6 +84,7 @@ class TestSearchRequestPreparation:
         assert scrape_data["skipTlsVerification"] is True
         assert "removeBase64Images" in scrape_data
         assert scrape_data["removeBase64Images"] is False
+        assert data["integration"] == "_e2e-test"

     def test_exclude_none_behavior(self):
         """Test that exclude_none=True behavior is working."""
@@ -164,4 +166,4 @@ class TestSearchRequestPreparation:
         assert "only_main_content" not in scrape_data
         assert "wait_for" not in scrape_data
         assert "skip_tls_verification" not in scrape_data
-        assert "remove_base64_images" not in scrape_data
+        assert "remove_base64_images" not in scrape_data
```
```diff
@@ -309,6 +309,7 @@ class V1MapParams(pydantic.BaseModel):
     limit: Optional[int] = None
     timeout: Optional[int] = 30000
     useIndex: Optional[bool] = None
+    location: Optional[V1LocationConfig] = None

 class V1MapResponse(pydantic.BaseModel):
     """Response from mapping operations."""
@@ -1333,6 +1334,7 @@ class V1FirecrawlApp:
             limit: Optional[int] = None,
             timeout: Optional[int] = 30000,
             use_index: Optional[bool] = None,
+            location: Optional[V1LocationConfig] = None,
             **kwargs) -> V1MapResponse:
         """
         Map and discover links from a URL.
@@ -1377,6 +1379,8 @@ class V1FirecrawlApp:
             map_params['timeout'] = timeout
         if use_index is not None:
             map_params['useIndex'] = use_index
+        if location is not None:
+            map_params['location'] = location.dict(by_alias=True, exclude_none=True)

         # Add any additional kwargs
         map_params.update(kwargs)
@@ -3910,6 +3914,7 @@ class AsyncV1FirecrawlApp(V1FirecrawlApp):
             sitemap_only: Optional[bool] = None,
             limit: Optional[int] = None,
             timeout: Optional[int] = 30000,
+            location: Optional[V1LocationConfig] = None,
             params: Optional[V1MapParams] = None) -> V1MapResponse:
         """
         Asynchronously map and discover links from a URL.
@@ -3952,6 +3957,8 @@ class AsyncV1FirecrawlApp(V1FirecrawlApp):
             map_params['limit'] = limit
         if timeout is not None:
             map_params['timeout'] = timeout
+        if location is not None:
+            map_params['location'] = location.dict(by_alias=True, exclude_none=True)

         # Create final params object
         final_params = V1MapParams(**map_params)
```
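The v1 surface only picks up the new `location` pass-through for map. A minimal sketch of the added field, using the class names from the hunks above; the `country` field on `V1LocationConfig` is an assumption mirroring the API's location object:

```python
# Hedged sketch: V1MapParams / V1LocationConfig names come from the diff above;
# the `country` field on V1LocationConfig is an assumption.
from firecrawl.v1.client import V1LocationConfig, V1MapParams

params = V1MapParams(limit=10, location=V1LocationConfig(country="DE"))
# Same serialization the v1 client applies before sending the request:
print(params.location.dict(by_alias=True, exclude_none=True))
```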
```diff
@@ -117,6 +117,7 @@ class FirecrawlClient:
         proxy: Optional[str] = None,
         max_age: Optional[int] = None,
         store_in_cache: Optional[bool] = None,
+        integration: Optional[str] = None,
     ) -> Document:
         """
         Scrape a single URL and return the document.
@@ -165,8 +166,9 @@ class FirecrawlClient:
                 proxy=proxy,
                 max_age=max_age,
                 store_in_cache=store_in_cache,
+                integration=integration,
             ).items() if v is not None}
-        ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
+        ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache, integration]) else None
         return scrape_module.scrape(self.http_client, url, options)

     def search(
@@ -181,6 +183,7 @@ class FirecrawlClient:
         ignore_invalid_urls: Optional[bool] = None,
         timeout: Optional[int] = None,
         scrape_options: Optional[ScrapeOptions] = None,
+        integration: Optional[str] = None,
     ) -> SearchData:
         """
         Search for documents.
@@ -206,6 +209,7 @@ class FirecrawlClient:
             ignore_invalid_urls=ignore_invalid_urls,
             timeout=timeout,
             scrape_options=scrape_options,
+            integration=integration,
         )

         return search_module.search(self.http_client, request)
@@ -230,7 +234,8 @@ class FirecrawlClient:
         scrape_options: Optional[ScrapeOptions] = None,
         zero_data_retention: bool = False,
         poll_interval: int = 2,
-        timeout: Optional[int] = None
+        timeout: Optional[int] = None,
+        integration: Optional[str] = None,
     ) -> CrawlJob:
         """
         Start a crawl job and wait for it to complete.
@@ -279,7 +284,8 @@ class FirecrawlClient:
             max_concurrency=max_concurrency,
             webhook=webhook,
             scrape_options=scrape_options,
-            zero_data_retention=zero_data_retention
+            zero_data_retention=zero_data_retention,
+            integration=integration,
         )

         return crawl_module.crawl(
@@ -307,7 +313,8 @@ class FirecrawlClient:
         max_concurrency: Optional[int] = None,
         webhook: Optional[Union[str, WebhookConfig]] = None,
         scrape_options: Optional[ScrapeOptions] = None,
-        zero_data_retention: bool = False
+        zero_data_retention: bool = False,
+        integration: Optional[str] = None,
     ) -> CrawlResponse:
         """
         Start an asynchronous crawl job.
@@ -353,7 +360,8 @@ class FirecrawlClient:
             max_concurrency=max_concurrency,
             webhook=webhook,
             scrape_options=scrape_options,
-            zero_data_retention=zero_data_retention
+            zero_data_retention=zero_data_retention,
+            integration=integration,
         )

         return crawl_module.start_crawl(self.http_client, request)
@@ -421,6 +429,8 @@ class FirecrawlClient:
         limit: Optional[int] = None,
         sitemap: Optional[Literal["only", "include", "skip"]] = None,
         timeout: Optional[int] = None,
+        integration: Optional[str] = None,
+        location: Optional[Location] = None,
     ) -> MapData:
         """Map a URL and return discovered links.

@@ -441,7 +451,9 @@ class FirecrawlClient:
             limit=limit,
             sitemap=sitemap if sitemap is not None else "include",
             timeout=timeout,
-
+            integration=integration,
+            location=location
+        ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, timeout, integration, location]) else None

         return map_module.map(self.http_client, url, options)

@@ -482,6 +494,7 @@ class FirecrawlClient:
         show_sources: Optional[bool] = None,
         scrape_options: Optional['ScrapeOptions'] = None,
         ignore_invalid_urls: Optional[bool] = None,
+        integration: Optional[str] = None,
     ):
         """Start an extract job (non-blocking).

@@ -510,6 +523,7 @@ class FirecrawlClient:
             show_sources=show_sources,
             scrape_options=scrape_options,
             ignore_invalid_urls=ignore_invalid_urls,
+            integration=integration,
         )

     def extract(
@@ -526,6 +540,7 @@ class FirecrawlClient:
         ignore_invalid_urls: Optional[bool] = None,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        integration: Optional[str] = None,
     ):
         """Extract structured data and wait until completion.

@@ -558,6 +573,7 @@ class FirecrawlClient:
             ignore_invalid_urls=ignore_invalid_urls,
             poll_interval=poll_interval,
             timeout=timeout,
+            integration=integration,
         )

     def start_batch_scrape(
```
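At the client level these changes mean every top-level v2 method can forward an integration tag. A hedged usage sketch; the exported `Firecrawl` class name and `api_key` argument are assumptions about the public wrapper around `FirecrawlClient`, and the calls hit the live API:

```python
# Hedged usage sketch (class name and api_key kwarg are assumptions; requires a real key).
from firecrawl import Firecrawl

app = Firecrawl(api_key="fc-YOUR-API-KEY")

doc = app.scrape("https://example.com", formats=["markdown"], integration="my-integration")
job = app.crawl("https://example.com", limit=3, poll_interval=1, integration="my-integration")
```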
```diff
@@ -132,6 +132,7 @@ class AsyncFirecrawlClient:
         limit: Optional[int] = None,
         sitemap: Optional[Literal["only", "include", "skip"]] = None,
         timeout: Optional[int] = None,
+        integration: Optional[str] = None,
     ) -> MapData:
         options = MapOptions(
             search=search,
@@ -139,7 +140,8 @@ class AsyncFirecrawlClient:
             limit=limit,
             sitemap=sitemap if sitemap is not None else "include",
             timeout=timeout,
-
+            integration=integration,
+        ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, integration, timeout]) else None
         return await async_map.map(self.async_http_client, url, options)

     async def start_batch_scrape(self, urls: List[str], **kwargs) -> Any:
@@ -196,6 +198,7 @@ class AsyncFirecrawlClient:
         ignore_invalid_urls: Optional[bool] = None,
         poll_interval: int = 2,
         timeout: Optional[int] = None,
+        integration: Optional[str] = None,
     ):
         return await async_extract.extract(
             self.async_http_client,
@@ -210,6 +213,7 @@ class AsyncFirecrawlClient:
             ignore_invalid_urls=ignore_invalid_urls,
             poll_interval=poll_interval,
             timeout=timeout,
+            integration=integration,
         )

     async def get_extract_status(self, job_id: str):
@@ -227,6 +231,7 @@ class AsyncFirecrawlClient:
         show_sources: Optional[bool] = None,
         scrape_options: Optional['ScrapeOptions'] = None,
         ignore_invalid_urls: Optional[bool] = None,
+        integration: Optional[str] = None,
     ):
         return await async_extract.start_extract(
             self.async_http_client,
@@ -239,6 +244,7 @@ class AsyncFirecrawlClient:
             show_sources=show_sources,
             scrape_options=scrape_options,
             ignore_invalid_urls=ignore_invalid_urls,
+            integration=integration,
         )

     # Usage endpoints
```
```diff
@@ -26,7 +26,9 @@ def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwar
     if (v := kwargs.get("zero_data_retention")) is not None:
         payload["zeroDataRetention"] = v
     if (v := kwargs.get("integration")) is not None:
-
+        trimmed_integration = str(v).strip()
+        if trimmed_integration:
+            payload["integration"] = trimmed_integration
     return payload


```
```diff
@@ -56,6 +56,8 @@ def _prepare_crawl_request(request: CrawlRequest) -> dict:
         if snake in request_data:
             data[camel] = request_data.pop(snake)
     data.update(request_data)
+    if getattr(request, "integration", None) is not None:
+        data["integration"] = str(getattr(request, "integration")).strip()
     return data


```
```diff
@@ -17,6 +17,7 @@ def _prepare_extract_request(
     show_sources: Optional[bool] = None,
     scrape_options: Optional[ScrapeOptions] = None,
     ignore_invalid_urls: Optional[bool] = None,
+    integration: Optional[str] = None,
 ) -> Dict[str, Any]:
     body: Dict[str, Any] = {}
     if urls is not None:
@@ -39,6 +40,8 @@ def _prepare_extract_request(
         prepared = prepare_scrape_options(scrape_options)
         if prepared:
             body["scrapeOptions"] = prepared
+    if integration is not None and str(integration).strip():
+        body["integration"] = str(integration).strip()
     return body


@@ -54,6 +57,7 @@ async def start_extract(
     show_sources: Optional[bool] = None,
     scrape_options: Optional[ScrapeOptions] = None,
     ignore_invalid_urls: Optional[bool] = None,
+    integration: Optional[str] = None,
 ) -> ExtractResponse:
     body = _prepare_extract_request(
         urls,
@@ -65,6 +69,7 @@ async def start_extract(
         show_sources=show_sources,
         scrape_options=scrape_options,
         ignore_invalid_urls=ignore_invalid_urls,
+        integration=integration,
     )
     resp = await client.post("/v2/extract", body)
     return ExtractResponse(**resp.json())
@@ -106,6 +111,7 @@ async def extract(
     ignore_invalid_urls: Optional[bool] = None,
     poll_interval: int = 2,
     timeout: Optional[int] = None,
+    integration: Optional[str] = None,
 ) -> ExtractResponse:
     started = await start_extract(
         client,
@@ -118,6 +124,7 @@ async def extract(
         show_sources=show_sources,
         scrape_options=scrape_options,
         ignore_invalid_urls=ignore_invalid_urls,
+        integration=integration,
     )
     job_id = getattr(started, "id", None)
     if not job_id:
```
```diff
@@ -20,6 +20,10 @@ def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict
             data["limit"] = options.limit
         if options.timeout is not None:
             data["timeout"] = options.timeout
+        if options.integration is not None:
+            data["integration"] = options.integration.strip()
+        if options.location is not None:
+            data["location"] = options.location.model_dump(exclude_none=True)
         payload.update(data)
     return payload

```
```diff
@@ -10,6 +10,7 @@ from ...types import (
 )
 from ...utils.http_client_async import AsyncHttpClient
 from ...utils.error_handler import handle_response_error
+from ...utils.normalize import normalize_document_input
 from ...utils.validation import validate_scrape_options, prepare_scrape_options

 T = TypeVar("T")
@@ -73,7 +74,7 @@ def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, Docu
             "summary" in item or
             "json" in item
         ):
-            results.append(Document(**item))
+            results.append(Document(**normalize_document_input(item)))
         else:
             results.append(result_type(**item))
     else:
@@ -168,5 +169,8 @@ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
     if scrape_data:
         data["scrapeOptions"] = scrape_data
         data.pop("scrape_options", None)
+
+    if (v := getattr(validated_request, "integration", None)) is not None and str(v).strip():
+        data["integration"] = str(validated_request.integration).strip()

     return data
```
```diff
@@ -407,7 +407,7 @@ def prepare_batch_scrape_request(
     if zero_data_retention is not None:
         request_data["zeroDataRetention"] = zero_data_retention
     if integration is not None:
-        request_data["integration"] = integration
+        request_data["integration"] = str(integration).strip()

     return request_data

```
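Because `prepare_batch_scrape_request` now strips the tag, callers can pass it with stray whitespace and still get a clean payload. A small sketch mirroring the batch unit test above (module path taken from the file list):

```python
from firecrawl.v2.methods.batch import prepare_batch_scrape_request

data = prepare_batch_scrape_request(
    ["https://example.com"],
    integration="  my-integration  ",  # surrounding whitespace is stripped
)
assert data["integration"] == "my-integration"
```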
```diff
@@ -99,6 +99,9 @@ def _prepare_crawl_request(request: CrawlRequest) -> dict:

     # Add any remaining fields that don't need conversion (like limit)
     data.update(request_data)
+    # Trim integration if present
+    if "integration" in data and isinstance(data["integration"], str):
+        data["integration"] = data["integration"].strip()

     return data

```
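The same trim happens on the synchronous crawl path, since `integration` is one of the fields copied straight from `CrawlRequest` into the payload. A sketch (module path from the file list):

```python
from firecrawl.v2.types import CrawlRequest
from firecrawl.v2.methods.crawl import _prepare_crawl_request

req = CrawlRequest(url="https://example.com", limit=5, integration=" my-integration ")
payload = _prepare_crawl_request(req)
assert payload["integration"] == "my-integration"
```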
```diff
@@ -18,6 +18,7 @@ def _prepare_extract_request(
     show_sources: Optional[bool] = None,
     scrape_options: Optional[ScrapeOptions] = None,
     ignore_invalid_urls: Optional[bool] = None,
+    integration: Optional[str] = None,
 ) -> Dict[str, Any]:
     body: Dict[str, Any] = {}
     if urls is not None:
@@ -40,6 +41,8 @@ def _prepare_extract_request(
         prepared = prepare_scrape_options(scrape_options)
         if prepared:
             body["scrapeOptions"] = prepared
+    if integration is not None and str(integration).strip():
+        body["integration"] = str(integration).strip()
     return body


@@ -55,6 +58,7 @@ def start_extract(
     show_sources: Optional[bool] = None,
     scrape_options: Optional[ScrapeOptions] = None,
     ignore_invalid_urls: Optional[bool] = None,
+    integration: Optional[str] = None,
 ) -> ExtractResponse:
     body = _prepare_extract_request(
         urls,
@@ -66,6 +70,7 @@ def start_extract(
         show_sources=show_sources,
         scrape_options=scrape_options,
         ignore_invalid_urls=ignore_invalid_urls,
+        integration=integration,
     )
     resp = client.post("/v2/extract", body)
     if not resp.ok:
@@ -111,6 +116,7 @@ def extract(
     ignore_invalid_urls: Optional[bool] = None,
     poll_interval: int = 2,
     timeout: Optional[int] = None,
+    integration: Optional[str] = None,
 ) -> ExtractResponse:
     started = start_extract(
         client,
@@ -123,6 +129,7 @@ def extract(
         show_sources=show_sources,
         scrape_options=scrape_options,
         ignore_invalid_urls=ignore_invalid_urls,
+        integration=integration,
     )
     job_id = getattr(started, "id", None)
     if not job_id:
```
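On the extract path the tag is only attached when it is non-empty after trimming. A sketch (module path from the file list; keyword-only use of `urls` is an assumption):

```python
from firecrawl.v2.methods.extract import _prepare_extract_request

body = _prepare_extract_request(urls=["https://example.com"], integration="  my-integration  ")
assert body["integration"] == "my-integration"

# An all-whitespace tag is dropped entirely:
assert "integration" not in _prepare_extract_request(urls=["https://example.com"], integration="   ")
```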
```diff
@@ -27,6 +27,10 @@ def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict
             data["limit"] = options.limit
         if options.timeout is not None:
             data["timeout"] = options.timeout
+        if options.integration is not None and options.integration.strip():
+            data["integration"] = options.integration.strip()
+        if options.location is not None:
+            data["location"] = options.location.model_dump(exclude_none=True)
         payload.update(data)

     return payload
```
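Map mirrors the behaviour, and additionally serializes the new `location` option with `model_dump(exclude_none=True)`. A sketch mirroring the map unit tests above, kept to the fields those tests exercise:

```python
from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.map import _prepare_map_request

opts = MapOptions(limit=10, integration=" my-integration ")
payload = _prepare_map_request("https://example.com", opts)
assert payload["integration"] == "my-integration"
```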
```diff
@@ -71,7 +71,7 @@ def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Doc
             "summary" in item or
             "json" in item
         ):
-            results.append(Document(**item))
+            results.append(Document(**normalize_document_input(item)))
         else:
             results.append(result_type(**item))
     else:
@@ -194,4 +194,7 @@ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
         data["scrapeOptions"] = scrape_data
         data.pop("scrape_options", None)

+    if (str(getattr(validated_request, "integration", "")).strip()):
+        data["integration"] = str(validated_request.integration).strip()
+
     return data
```
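Search attaches the tag after the usual camel-casing of the request. A sketch mirroring the search unit tests above; the `query` field name on `SearchRequest` is an assumption, the rest comes from the diff:

```python
from firecrawl.v2.types import SearchRequest
from firecrawl.v2.methods.search import _prepare_search_request

data = _prepare_search_request(SearchRequest(query="firecrawl", integration=" my-integration "))
assert data["integration"] == "my-integration"
```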
```diff
@@ -289,6 +289,7 @@ class ScrapeOptions(BaseModel):
     proxy: Optional[Literal["basic", "stealth", "auto"]] = None
     max_age: Optional[int] = None
     store_in_cache: Optional[bool] = None
+    integration: Optional[str] = None

     @field_validator('formats')
     @classmethod
@@ -334,6 +335,7 @@ class CrawlRequest(BaseModel):
     webhook: Optional[Union[str, WebhookConfig]] = None
     scrape_options: Optional[ScrapeOptions] = None
     zero_data_retention: bool = False
+    integration: Optional[str] = None

 class CrawlResponse(BaseModel):
     """Information about a crawl job."""
@@ -350,6 +352,10 @@ class CrawlJob(BaseModel):
     next: Optional[str] = None
     data: List[Document] = []

+class CrawlStatusRequest(BaseModel):
+    """Request to get crawl job status."""
+    job_id: str
+
 class SearchResultWeb(BaseModel):
     """A web search result with URL, title, and description."""
     url: str
@@ -410,6 +416,7 @@ class CrawlParamsData(BaseModel):
     scrape_options: Optional[ScrapeOptions] = None
     zero_data_retention: bool = False
     warning: Optional[str] = None
+    integration: Optional[str] = None

 class CrawlParamsResponse(BaseResponse[CrawlParamsData]):
     """Response from crawl params endpoint."""
@@ -420,6 +427,12 @@ class BatchScrapeRequest(BaseModel):
     """Request for batch scraping multiple URLs (internal helper only)."""
     urls: List[str]
     options: Optional[ScrapeOptions] = None
+    webhook: Optional[Union[str, WebhookConfig]] = None
+    append_to_id: Optional[str] = None
+    ignore_invalid_urls: Optional[bool] = None
+    max_concurrency: Optional[int] = None
+    zero_data_retention: Optional[bool] = None
+    integration: Optional[str] = None

 class BatchScrapeResponse(BaseModel):
     """Response from starting a batch scrape job (mirrors CrawlResponse naming)."""
@@ -437,6 +450,14 @@ class BatchScrapeJob(BaseModel):
     next: Optional[str] = None
     data: List[Document] = []

+class BatchScrapeStatusRequest(BaseModel):
+    """Request to get batch scrape job status."""
+    job_id: str
+
+class BatchScrapeErrorsRequest(BaseModel):
+    """Request to get errors for a batch scrape job."""
+    job_id: str
+
 # Map types
 class MapOptions(BaseModel):
     """Options for mapping operations."""
@@ -445,12 +466,16 @@ class MapOptions(BaseModel):
     include_subdomains: Optional[bool] = None
     limit: Optional[int] = None
     timeout: Optional[int] = None
+    integration: Optional[str] = None
+    location: Optional['Location'] = None

 class MapRequest(BaseModel):
     """Request for mapping a website."""
     url: str
     options: Optional[MapOptions] = None

+
+
 class MapData(BaseModel):
     """Map results data."""
     links: List['SearchResult']
@@ -460,6 +485,19 @@ class MapResponse(BaseResponse[MapData]):
     pass

 # Extract types
+class ExtractRequest(BaseModel):
+    """Request for extract operations."""
+    urls: Optional[List[str]] = None
+    prompt: Optional[str] = None
+    schema_: Optional[Dict[str, Any]] = Field(default=None, alias="schema")
+    system_prompt: Optional[str] = None
+    allow_external_links: Optional[bool] = None
+    enable_web_search: Optional[bool] = None
+    show_sources: Optional[bool] = None
+    scrape_options: Optional[ScrapeOptions] = None
+    ignore_invalid_urls: Optional[bool] = None
+    integration: Optional[str] = None
+
 class ExtractResponse(BaseModel):
     """Response for extract operations (start/status/final)."""
     success: Optional[bool] = None
@@ -491,6 +529,10 @@ class TokenUsage(BaseModel):
     billing_period_start: Optional[str] = None
     billing_period_end: Optional[str] = None

+class QueueStatusRequest(BaseModel):
+    """Request to retrieve queue status."""
+    pass
+
 class QueueStatusResponse(BaseModel):
     """Metrics about the team's scrape queue."""
     jobs_in_queue: int
@@ -592,6 +634,7 @@ class SearchRequest(BaseModel):
     ignore_invalid_urls: Optional[bool] = None
     timeout: Optional[int] = 60000
     scrape_options: Optional[ScrapeOptions] = None
+    integration: Optional[str] = None

     @field_validator('sources')
     @classmethod
@@ -691,6 +734,10 @@ class CrawlErrorsResponse(BaseModel):
     errors: List[CrawlError]
     robots_blocked: List[str]

+class CrawlErrorsRequest(BaseModel):
+    """Request for crawl error monitoring."""
+    crawl_id: str
+
 class ActiveCrawl(BaseModel):
     """Information about an active crawl job."""
     id: str
@@ -703,6 +750,10 @@ class ActiveCrawlsResponse(BaseModel):
     success: bool = True
     crawls: List[ActiveCrawl]

+class ActiveCrawlsRequest(BaseModel):
+    """Request for listing active crawl jobs."""
+    pass
+
 # Configuration types
 class ClientConfig(BaseModel):
     """Configuration for the Firecrawl client."""
```
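The new and extended models can be constructed directly; the sketch below uses only fields visible in the hunks above:

```python
from firecrawl.v2.types import BatchScrapeRequest, CrawlRequest, MapOptions

crawl = CrawlRequest(url="https://example.com", integration="my-integration")
batch = BatchScrapeRequest(urls=["https://example.com"], max_concurrency=2, integration="my-integration")
opts = MapOptions(limit=10, integration="my-integration")
print(crawl.integration, batch.integration, opts.integration)
```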
```diff
@@ -177,6 +177,9 @@ def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[st
     # Handle special cases
     for key, value in options_data.items():
         if value is not None:
+            if key == "integration":
+                scrape_data["integration"] = (str(value).strip() or None)
+                continue
             if key == "formats":
                 # Handle formats conversion
                 converted_formats: List[Any] = []
```
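Finally, `prepare_scrape_options` normalizes the tag for every scrape-shaped payload: it is stripped, and an all-whitespace value collapses to `None`. A sketch (paths from the file list):

```python
from firecrawl.v2.types import ScrapeOptions
from firecrawl.v2.utils.validation import prepare_scrape_options

data = prepare_scrape_options(ScrapeOptions(formats=["markdown"], integration=" my-integration "))
assert data["integration"] == "my-integration"

# Whitespace-only tags collapse to None rather than an empty string:
assert prepare_scrape_options(ScrapeOptions(formats=["markdown"], integration="   ")).get("integration") is None
```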