firecrawl 4.12.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/client.py
ADDED
@@ -0,0 +1,967 @@
"""
Main Firecrawl v2 API client.

This module provides the main client class that orchestrates all v2 functionality.
"""

import os
from typing import Optional, List, Dict, Any, Callable, Union, Literal
from .types import (
    ClientConfig,
    ScrapeOptions,
    Document,
    SearchRequest,
    SearchData,
    SourceOption,
    CategoryOption,
    CrawlRequest,
    CrawlResponse,
    CrawlJob,
    CrawlParamsRequest,
    PDFParser,
    CrawlParamsData,
    WebhookConfig,
    CrawlErrorsResponse,
    ActiveCrawlsResponse,
    MapOptions,
    MapData,
    FormatOption,
    WaitAction,
    ScreenshotAction,
    ClickAction,
    WriteAction,
    PressAction,
    ScrollAction,
    ScrapeAction,
    ExecuteJavascriptAction,
    PDFAction,
    Location,
    PaginationConfig,
    AgentOptions,
)
from .utils.http_client import HttpClient
from .utils.error_handler import FirecrawlError
from .methods import scrape as scrape_module
from .methods import crawl as crawl_module
from .methods import batch as batch_module
from .methods import search as search_module
from .methods import map as map_module
from .methods import batch as batch_methods
from .methods import usage as usage_methods
from .methods import extract as extract_module
from .methods import agent as agent_module
from .watcher import Watcher

class FirecrawlClient:
    """
    Main Firecrawl v2 API client.

    This client provides a clean, modular interface to all Firecrawl functionality.
    """

    @staticmethod
    def _is_cloud_service(url: str) -> bool:
        return "api.firecrawl.dev" in url.lower()

    def __init__(
        self,
        api_key: Optional[str] = None,
        api_url: str = "https://api.firecrawl.dev",
        timeout: Optional[float] = None,
        max_retries: int = 3,
        backoff_factor: float = 0.5
    ):
        """
        Initialize the Firecrawl client.

        Args:
            api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
            api_url: Base URL for the Firecrawl API
            timeout: Request timeout in seconds
            max_retries: Maximum number of retries for failed requests
            backoff_factor: Exponential backoff factor for retries (e.g. 0.5 means wait 0.5s, then 1s, then 2s between retries)
        """
        if api_key is None:
            api_key = os.getenv("FIRECRAWL_API_KEY")

        if self._is_cloud_service(api_url) and not api_key:
            raise ValueError(
                "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
                "or pass api_key parameter."
            )

        self.config = ClientConfig(
            api_key=api_key,
            api_url=api_url,
            timeout=timeout,
            max_retries=max_retries,
            backoff_factor=backoff_factor
        )

        self.http_client = HttpClient(api_key, api_url)

    def scrape(
        self,
        url: str,
        *,
        formats: Optional[List['FormatOption']] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        timeout: Optional[int] = None,
        wait_for: Optional[int] = None,
        mobile: Optional[bool] = None,
        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
        actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
        location: Optional['Location'] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        fast_mode: Optional[bool] = None,
        use_mock: Optional[str] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[str] = None,
        max_age: Optional[int] = None,
        store_in_cache: Optional[bool] = None,
        integration: Optional[str] = None,
    ) -> Document:
        """
        Scrape a single URL and return the document.
        Args:
            url: URL to scrape
            formats: List of formats to scrape
            headers: Dictionary of headers to use
            include_tags: List of tags to include
            exclude_tags: List of tags to exclude
            only_main_content: Whether to only scrape the main content
            timeout: Timeout in seconds
            wait_for: Wait for a specific element to be present
            mobile: Whether to use mobile mode
            parsers: List of parsers to use
            actions: List of actions to perform
            location: Location to scrape
            skip_tls_verification: Whether to skip TLS verification
            remove_base64_images: Whether to remove base64 images
            fast_mode: Whether to use fast mode
            use_mock: Whether to use mock mode
            block_ads: Whether to block ads
            proxy: Proxy to use
            max_age: Maximum age of the cache
            store_in_cache: Whether to store the result in the cache
        Returns:
            Document
        """
        options = ScrapeOptions(
            **{k: v for k, v in dict(
                formats=formats,
                headers=headers,
                include_tags=include_tags,
                exclude_tags=exclude_tags,
                only_main_content=only_main_content,
                timeout=timeout,
                wait_for=wait_for,
                mobile=mobile,
                parsers=parsers,
                actions=actions,
                location=location,
                skip_tls_verification=skip_tls_verification,
                remove_base64_images=remove_base64_images,
                fast_mode=fast_mode,
                use_mock=use_mock,
                block_ads=block_ads,
                proxy=proxy,
                max_age=max_age,
                store_in_cache=store_in_cache,
                integration=integration,
            ).items() if v is not None}
        ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache, integration]) else None
        return scrape_module.scrape(self.http_client, url, options)

    def search(
        self,
        query: str,
        *,
        sources: Optional[List[SourceOption]] = None,
        categories: Optional[List[CategoryOption]] = None,
        limit: Optional[int] = None,
        tbs: Optional[str] = None,
        location: Optional[str] = None,
        ignore_invalid_urls: Optional[bool] = None,
        timeout: Optional[int] = None,
        scrape_options: Optional[ScrapeOptions] = None,
        integration: Optional[str] = None,
    ) -> SearchData:
        """
        Search for documents.

        Args:
            query: Search query string
            limit: Maximum number of results to return (default: 5)
            tbs: Time-based search filter
            location: Location string for search
            timeout: Request timeout in milliseconds (default: 300000)
            page_options: Options for scraping individual pages

        Returns:
            SearchData containing the search results
        """
        request = SearchRequest(
            query=query,
            sources=sources,
            categories=categories,
            limit=limit,
            tbs=tbs,
            location=location,
            ignore_invalid_urls=ignore_invalid_urls,
            timeout=timeout,
            scrape_options=scrape_options,
            integration=integration,
        )

        return search_module.search(self.http_client, request)

    def crawl(
        self,
        url: str,
        *,
        prompt: Optional[str] = None,
        exclude_paths: Optional[List[str]] = None,
        include_paths: Optional[List[str]] = None,
        max_discovery_depth: Optional[int] = None,
        ignore_sitemap: bool = False,
        ignore_query_parameters: bool = False,
        limit: Optional[int] = None,
        crawl_entire_domain: bool = False,
        allow_external_links: bool = False,
        allow_subdomains: bool = False,
        delay: Optional[int] = None,
        max_concurrency: Optional[int] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        scrape_options: Optional[ScrapeOptions] = None,
        zero_data_retention: bool = False,
        poll_interval: int = 2,
        timeout: Optional[int] = None,
        request_timeout: Optional[float] = None,
        integration: Optional[str] = None,
    ) -> CrawlJob:
        """
        Start a crawl job and wait for it to complete.

        Args:
            url: Target URL to start crawling from
            prompt: Optional prompt to guide the crawl
            exclude_paths: Patterns of URLs to exclude
            include_paths: Patterns of URLs to include
            max_discovery_depth: Maximum depth for finding new URLs
            ignore_sitemap: Skip sitemap.xml processing
            ignore_query_parameters: Ignore URL parameters
            limit: Maximum pages to crawl
            crawl_entire_domain: Follow parent directory links
            allow_external_links: Follow external domain links
            allow_subdomains: Follow subdomains
            delay: Delay in seconds between scrapes
            max_concurrency: Maximum number of concurrent scrapes
            webhook: Webhook configuration for notifications
            scrape_options: Page scraping configuration
            zero_data_retention: Whether to delete data after 24 hours
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
            request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout

        Returns:
            CrawlJob when job completes

        Raises:
            ValueError: If request is invalid
            Exception: If the crawl fails to start or complete
            TimeoutError: If timeout is reached
        """
        request = CrawlRequest(
            url=url,
            prompt=prompt,
            exclude_paths=exclude_paths,
            include_paths=include_paths,
            max_discovery_depth=max_discovery_depth,
            ignore_sitemap=ignore_sitemap,
            ignore_query_parameters=ignore_query_parameters,
            limit=limit,
            crawl_entire_domain=crawl_entire_domain,
            allow_external_links=allow_external_links,
            allow_subdomains=allow_subdomains,
            delay=delay,
            max_concurrency=max_concurrency,
            webhook=webhook,
            scrape_options=scrape_options,
            zero_data_retention=zero_data_retention,
            integration=integration,
        )

        return crawl_module.crawl(
            self.http_client,
            request,
            poll_interval=poll_interval,
            timeout=timeout,
            request_timeout=request_timeout,
        )

    def start_crawl(
        self,
        url: str,
        *,
        prompt: Optional[str] = None,
        exclude_paths: Optional[List[str]] = None,
        include_paths: Optional[List[str]] = None,
        max_discovery_depth: Optional[int] = None,
        ignore_sitemap: bool = False,
        ignore_query_parameters: bool = False,
        limit: Optional[int] = None,
        crawl_entire_domain: bool = False,
        allow_external_links: bool = False,
        allow_subdomains: bool = False,
        delay: Optional[int] = None,
        max_concurrency: Optional[int] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        scrape_options: Optional[ScrapeOptions] = None,
        zero_data_retention: bool = False,
        integration: Optional[str] = None,
    ) -> CrawlResponse:
        """
        Start an asynchronous crawl job.

        Args:
            url: Target URL to start crawling from
            prompt: Optional prompt to guide the crawl
            exclude_paths: Patterns of URLs to exclude
            include_paths: Patterns of URLs to include
            max_discovery_depth: Maximum depth for finding new URLs
            ignore_sitemap: Skip sitemap.xml processing
            ignore_query_parameters: Ignore URL parameters
            limit: Maximum pages to crawl
            crawl_entire_domain: Follow parent directory links
            allow_external_links: Follow external domain links
            allow_subdomains: Follow subdomains
            delay: Delay in seconds between scrapes
            max_concurrency: Maximum number of concurrent scrapes
            webhook: Webhook configuration for notifications
            scrape_options: Page scraping configuration
            zero_data_retention: Whether to delete data after 24 hours

        Returns:
            CrawlResponse with job information

        Raises:
            ValueError: If request is invalid
            Exception: If the crawl operation fails to start
        """
        request = CrawlRequest(
            url=url,
            prompt=prompt,
            exclude_paths=exclude_paths,
            include_paths=include_paths,
            max_discovery_depth=max_discovery_depth,
            ignore_sitemap=ignore_sitemap,
            ignore_query_parameters=ignore_query_parameters,
            limit=limit,
            crawl_entire_domain=crawl_entire_domain,
            allow_external_links=allow_external_links,
            allow_subdomains=allow_subdomains,
            delay=delay,
            max_concurrency=max_concurrency,
            webhook=webhook,
            scrape_options=scrape_options,
            zero_data_retention=zero_data_retention,
            integration=integration,
        )

        return crawl_module.start_crawl(self.http_client, request)

    def get_crawl_status(
        self,
        job_id: str,
        pagination_config: Optional[PaginationConfig] = None,
        *,
        request_timeout: Optional[float] = None,
    ) -> CrawlJob:
        """
        Get the status of a crawl job.

        Args:
            job_id: ID of the crawl job
            pagination_config: Optional configuration for pagination behavior
            request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
                is enabled (default) and there are multiple pages of results, this timeout applies to
                each page request separately, not to the entire operation

        Returns:
            CrawlJob with current status and data

        Raises:
            Exception: If the status check fails
        """
        return crawl_module.get_crawl_status(
            self.http_client,
            job_id,
            pagination_config=pagination_config,
            request_timeout=request_timeout,
        )

    def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
        """
        Retrieve error details and robots.txt blocks for a given crawl job.

        Args:
            crawl_id: The ID of the crawl job

        Returns:
            CrawlErrorsResponse containing per-URL errors and robots-blocked URLs
        """
        return crawl_module.get_crawl_errors(self.http_client, crawl_id)

    def get_active_crawls(self) -> ActiveCrawlsResponse:
        """
        Get a list of currently active crawl jobs.

        Returns:
            ActiveCrawlsResponse containing a list of active crawl jobs.
        """
        return crawl_module.get_active_crawls(self.http_client)

    def active_crawls(self) -> ActiveCrawlsResponse:
        """
        List currently active crawl jobs for the authenticated team.

        Returns:
            ActiveCrawlsResponse containing the list of active crawl jobs
        """
        return self.get_active_crawls()

    def map(
        self,
        url: str,
        *,
        search: Optional[str] = None,
        include_subdomains: Optional[bool] = None,
        ignore_query_parameters: Optional[bool] = None,
        limit: Optional[int] = None,
        sitemap: Optional[Literal["only", "include", "skip"]] = None,
        timeout: Optional[int] = None,
        integration: Optional[str] = None,
        location: Optional[Location] = None,
    ) -> MapData:
        """Map a URL and return discovered links.

        Args:
            url: Root URL to explore
            search: Optional substring filter for discovered links
            include_subdomains: Whether to include subdomains
            ignore_query_parameters: Whether to ignore query parameters when mapping
            limit: Maximum number of links to return
            sitemap: Sitemap usage mode ("only" | "include" | "skip")
            timeout: Request timeout in milliseconds

        Returns:
            MapData containing the discovered links
        """
        options = MapOptions(
            search=search,
            include_subdomains=include_subdomains,
            ignore_query_parameters=ignore_query_parameters,
            limit=limit,
            sitemap=sitemap if sitemap is not None else "include",
            timeout=timeout,
            integration=integration,
            location=location
        ) if any(v is not None for v in [search, include_subdomains, ignore_query_parameters, limit, sitemap, timeout, integration, location]) else None

        return map_module.map(self.http_client, url, options)

    def cancel_crawl(self, crawl_id: str) -> bool:
        """
        Cancel a crawl job.

        Args:
            crawl_id: The ID of the crawl job to cancel

        Returns:
            bool: True if the crawl was cancelled, False otherwise
        """
        return crawl_module.cancel_crawl(self.http_client, crawl_id)

    def crawl_params_preview(self, url: str, prompt: str) -> CrawlParamsData:
        """Derive crawl parameters from natural-language prompt.

        Args:
            url: Root URL
            prompt: Instruction describing how to crawl

        Returns:
            CrawlParamsData with normalized crawl configuration
        """
        request = CrawlParamsRequest(url=url, prompt=prompt)
        return crawl_module.crawl_params_preview(self.http_client, request)

    def start_extract(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: Optional[str] = None,
        schema: Optional[Dict[str, Any]] = None,
        system_prompt: Optional[str] = None,
        allow_external_links: Optional[bool] = None,
        enable_web_search: Optional[bool] = None,
        show_sources: Optional[bool] = None,
        scrape_options: Optional['ScrapeOptions'] = None,
        ignore_invalid_urls: Optional[bool] = None,
        integration: Optional[str] = None,
        agent: Optional[AgentOptions] = None,
    ):
        """Start an extract job (non-blocking).

        Args:
            urls: URLs to extract from (optional)
            prompt: Natural-language instruction for extraction
            schema: Target JSON schema for the output
            system_prompt: Optional system instruction
            allow_external_links: Allow hyperlinks in output
            enable_web_search: Whether to augment with web search
            show_sources: Include per-field/source mapping when available
            scrape_options: Scrape options applied prior to extraction
            ignore_invalid_urls: Skip invalid URLs instead of failing
            integration: Integration tag/name
            agent: Agent configuration
        Returns:
            Response payload with job id/status (poll with get_extract_status)
        """
        return extract_module.start_extract(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            system_prompt=system_prompt,
            allow_external_links=allow_external_links,
            enable_web_search=enable_web_search,
            show_sources=show_sources,
            scrape_options=scrape_options,
            ignore_invalid_urls=ignore_invalid_urls,
            integration=integration,
            agent=agent,
        )

    def extract(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: Optional[str] = None,
        schema: Optional[Dict[str, Any]] = None,
        system_prompt: Optional[str] = None,
        allow_external_links: Optional[bool] = None,
        enable_web_search: Optional[bool] = None,
        show_sources: Optional[bool] = None,
        scrape_options: Optional['ScrapeOptions'] = None,
        ignore_invalid_urls: Optional[bool] = None,
        poll_interval: int = 2,
        timeout: Optional[int] = None,
        integration: Optional[str] = None,
        agent: Optional[AgentOptions] = None,
    ):
        """Extract structured data and wait until completion.

        Args:
            urls: URLs to extract from (optional)
            prompt: Natural-language instruction for extraction
            schema: Target JSON schema for the output
            system_prompt: Optional system instruction
            allow_external_links: Allow hyperlinks in output
            enable_web_search: Whether to augment with web search
            show_sources: Include per-field/source mapping when available
            scrape_options: Scrape options applied prior to extraction
            ignore_invalid_urls: Skip invalid URLs instead of failing
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait (None for no timeout)
            integration: Integration tag/name
            agent: Agent configuration
        Returns:
            Final extract response when completed
        """
        return extract_module.extract(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            system_prompt=system_prompt,
            allow_external_links=allow_external_links,
            enable_web_search=enable_web_search,
            show_sources=show_sources,
            scrape_options=scrape_options,
            ignore_invalid_urls=ignore_invalid_urls,
            poll_interval=poll_interval,
            timeout=timeout,
            integration=integration,
            agent=agent,
        )

    def start_batch_scrape(
        self,
        urls: List[str],
        *,
        formats: Optional[List['FormatOption']] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        timeout: Optional[int] = None,
        wait_for: Optional[int] = None,
        mobile: Optional[bool] = None,
        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
        actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
        location: Optional['Location'] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        fast_mode: Optional[bool] = None,
        use_mock: Optional[str] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[str] = None,
        max_age: Optional[int] = None,
        store_in_cache: Optional[bool] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        append_to_id: Optional[str] = None,
        ignore_invalid_urls: Optional[bool] = None,
        max_concurrency: Optional[int] = None,
        zero_data_retention: Optional[bool] = None,
        integration: Optional[str] = None,
        idempotency_key: Optional[str] = None,
    ):
        """Start a batch scrape job over multiple URLs (non-blocking).

        Args:
            urls: List of URLs to scrape
            formats: Output formats to collect per URL
            headers: HTTP headers
            include_tags: HTML tags to include
            exclude_tags: HTML tags to exclude
            only_main_content: Restrict scraping to main content
            timeout: Per-request timeout in milliseconds
            wait_for: Wait condition in milliseconds
            mobile: Emulate mobile viewport
            parsers: Parser list (e.g., ["pdf"])
            actions: Browser actions to perform
            location: Location settings
            skip_tls_verification: Skip TLS verification
            remove_base64_images: Remove base64 images from output
            fast_mode: Prefer faster scraping modes
            use_mock: Use a mock data source (internal/testing)
            block_ads: Block ads during scraping
            proxy: Proxy setting
            max_age: Cache max age
            store_in_cache: Whether to store results in cache
            webhook: Webhook configuration
            append_to_id: Append to an existing batch job
            ignore_invalid_urls: Skip invalid URLs without failing
            max_concurrency: Max concurrent scrapes
            zero_data_retention: Delete data after 24 hours
            integration: Integration tag/name
            idempotency_key: Header used to deduplicate starts

        Returns:
            Response payload with job id (poll with get_batch_scrape_status)
        """
        options = ScrapeOptions(
            **{k: v for k, v in dict(
                formats=formats,
                headers=headers,
                include_tags=include_tags,
                exclude_tags=exclude_tags,
                only_main_content=only_main_content,
                timeout=timeout,
                wait_for=wait_for,
                mobile=mobile,
                parsers=parsers,
                actions=actions,
                location=location,
                skip_tls_verification=skip_tls_verification,
                remove_base64_images=remove_base64_images,
                fast_mode=fast_mode,
                use_mock=use_mock,
                block_ads=block_ads,
                proxy=proxy,
                max_age=max_age,
                store_in_cache=store_in_cache,
            ).items() if v is not None}
        ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None

        return batch_module.start_batch_scrape(
            self.http_client,
            urls,
            options=options,
            webhook=webhook,
            append_to_id=append_to_id,
            ignore_invalid_urls=ignore_invalid_urls,
            max_concurrency=max_concurrency,
            zero_data_retention=zero_data_retention,
            integration=integration,
            idempotency_key=idempotency_key,
        )

    def get_batch_scrape_status(
        self,
        job_id: str,
        pagination_config: Optional[PaginationConfig] = None
    ):
        """Get current status and any scraped data for a batch job.

        Args:
            job_id: Batch job ID
            pagination_config: Optional configuration for pagination behavior

        Returns:
            Status payload including counts and partial data
        """
        return batch_module.get_batch_scrape_status(
            self.http_client,
            job_id,
            pagination_config=pagination_config
        )

    def cancel_batch_scrape(self, job_id: str) -> bool:
        """Cancel a running batch scrape job.

        Args:
            job_id: Batch job ID

        Returns:
            True if the job was cancelled
        """
        return batch_module.cancel_batch_scrape(self.http_client, job_id)

    def get_batch_scrape_errors(self, job_id: str):
        """Retrieve error details for a batch scrape job.

        Args:
            job_id: Batch job ID

        Returns:
            Errors and robots-blocked URLs for the job
        """
        return batch_methods.get_batch_scrape_errors(self.http_client, job_id)

    def get_extract_status(self, job_id: str):
        """Get the current status (and data if completed) of an extract job.

        Args:
            job_id: Extract job ID

        Returns:
            Extract response payload with status and optional data
        """
        return extract_module.get_extract_status(self.http_client, job_id)

    def start_agent(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: str,
        schema: Optional[Any] = None,
        integration: Optional[str] = None,
        max_credits: Optional[int] = None,
        strict_constrain_to_urls: Optional[bool] = None,
    ):
        """Start an agent job (non-blocking).

        Args:
            urls: URLs to process (optional)
            prompt: Natural-language instruction for the agent
            schema: Target JSON schema for the output (dict or Pydantic BaseModel)
            integration: Integration tag/name
            max_credits: Maximum credits to use (optional)
        Returns:
            Response payload with job id/status (poll with get_agent_status)
        """
        return agent_module.start_agent(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            integration=integration,
            max_credits=max_credits,
            strict_constrain_to_urls=strict_constrain_to_urls,
        )

    def agent(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: str,
        schema: Optional[Any] = None,
        integration: Optional[str] = None,
        poll_interval: int = 2,
        timeout: Optional[int] = None,
        max_credits: Optional[int] = None,
        strict_constrain_to_urls: Optional[bool] = None,
    ):
        """Run an agent and wait until completion.

        Args:
            urls: URLs to process (optional)
            prompt: Natural-language instruction for the agent
            schema: Target JSON schema for the output (dict or Pydantic BaseModel)
            integration: Integration tag/name
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait (None for no timeout)
            max_credits: Maximum credits to use (optional)
        Returns:
            Final agent response when completed
        """
        return agent_module.agent(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            integration=integration,
            poll_interval=poll_interval,
            timeout=timeout,
            max_credits=max_credits,
            strict_constrain_to_urls=strict_constrain_to_urls,
        )

    def get_agent_status(self, job_id: str):
        """Get the current status (and data if completed) of an agent job.

        Args:
            job_id: Agent job ID

        Returns:
            Agent response payload with status and optional data
        """
        return agent_module.get_agent_status(self.http_client, job_id)

    def cancel_agent(self, job_id: str) -> bool:
        """Cancel a running agent job.

        Args:
            job_id: Agent job ID

        Returns:
            True if the agent was cancelled
        """
        return agent_module.cancel_agent(self.http_client, job_id)

    def get_concurrency(self):
        """Get current concurrency and maximum allowed for this team/key (v2)."""
        return usage_methods.get_concurrency(self.http_client)

    def get_credit_usage(self):
        """Get remaining credits for this team/key (v2)."""
        return usage_methods.get_credit_usage(self.http_client)

    def get_token_usage(self):
        """Get recent token usage metrics (v2)."""
        return usage_methods.get_token_usage(self.http_client)

    def get_credit_usage_historical(self, by_api_key: bool = False):
        """Get historical credit usage (v2)."""
        return usage_methods.get_credit_usage_historical(self.http_client, by_api_key)

    def get_token_usage_historical(self, by_api_key: bool = False):
        """Get historical token usage (v2)."""
        return usage_methods.get_token_usage_historical(self.http_client, by_api_key)

    def get_queue_status(self):
        """Get metrics about the team's scrape queue."""
        return usage_methods.get_queue_status(self.http_client)

    def watcher(
        self,
        job_id: str,
        *,
        kind: Literal["crawl", "batch"] = "crawl",
        poll_interval: int = 2,
        timeout: Optional[int] = None,
    ) -> Watcher:
        """Create a watcher for crawl or batch jobs.

        Args:
            job_id: Job ID to watch
            kind: Job kind ("crawl" or "batch")
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to watch (None for no timeout)

        Returns:
            Watcher instance
        """
        return Watcher(self, job_id, kind=kind, poll_interval=poll_interval, timeout=timeout)

    def batch_scrape(
        self,
        urls: List[str],
        *,
        formats: Optional[List['FormatOption']] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
        timeout: Optional[int] = None,
        wait_for: Optional[int] = None,
        mobile: Optional[bool] = None,
        parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
        actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
        location: Optional['Location'] = None,
        skip_tls_verification: Optional[bool] = None,
        remove_base64_images: Optional[bool] = None,
        fast_mode: Optional[bool] = None,
        use_mock: Optional[str] = None,
        block_ads: Optional[bool] = None,
        proxy: Optional[str] = None,
        max_age: Optional[int] = None,
        store_in_cache: Optional[bool] = None,
        webhook: Optional[Union[str, WebhookConfig]] = None,
        append_to_id: Optional[str] = None,
        ignore_invalid_urls: Optional[bool] = None,
        max_concurrency: Optional[int] = None,
        zero_data_retention: Optional[bool] = None,
        integration: Optional[str] = None,
        idempotency_key: Optional[str] = None,
        poll_interval: int = 2,
        wait_timeout: Optional[int] = None,
    ):
        """
        Start a batch scrape job and wait until completion.
        """
        options = ScrapeOptions(
            **{k: v for k, v in dict(
                formats=formats,
                headers=headers,
                include_tags=include_tags,
                exclude_tags=exclude_tags,
                only_main_content=only_main_content,
                timeout=timeout,
                wait_for=wait_for,
                mobile=mobile,
                parsers=parsers,
                actions=actions,
                location=location,
                skip_tls_verification=skip_tls_verification,
                remove_base64_images=remove_base64_images,
                fast_mode=fast_mode,
                use_mock=use_mock,
                block_ads=block_ads,
                proxy=proxy,
                max_age=max_age,
                store_in_cache=store_in_cache,
            ).items() if v is not None}
        ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None

        return batch_module.batch_scrape(
            self.http_client,
            urls,
            options=options,
            webhook=webhook,
            append_to_id=append_to_id,
            ignore_invalid_urls=ignore_invalid_urls,
            max_concurrency=max_concurrency,
            zero_data_retention=zero_data_retention,
            integration=integration,
            idempotency_key=idempotency_key,
            poll_interval=poll_interval,
            timeout=wait_timeout,
        )
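For orientation only (not part of the package contents): a minimal usage sketch of the FirecrawlClient defined above. It assumes FIRECRAWL_API_KEY is set in the environment and imports the class directly from the module shown in this diff; the top-level firecrawl package may re-export it under a different name, and the argument values are illustrative.

# Hypothetical usage sketch, based on the methods shown in firecrawl/v2/client.py above.
from firecrawl.v2.client import FirecrawlClient

# Reads FIRECRAWL_API_KEY and targets https://api.firecrawl.dev by default.
client = FirecrawlClient()

# Scrape a single page into a Document; keyword args are forwarded into ScrapeOptions.
doc = client.scrape("https://example.com", formats=["markdown"], only_main_content=True)

# Blocking crawl: polls every poll_interval seconds, up to timeout seconds total.
job = client.crawl("https://example.com", limit=10, poll_interval=2, timeout=300)

# Account helpers exposed by the same client.
credits = client.get_credit_usage()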