firecrawl 4.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92) hide show
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/client.py ADDED
@@ -0,0 +1,967 @@
1
+ """
2
+ Main Firecrawl v2 API client.
3
+
4
+ This module provides the main client class that orchestrates all v2 functionality.
5
+ """
6
+
7
+ import os
8
+ from typing import Optional, List, Dict, Any, Callable, Union, Literal
9
+ from .types import (
10
+ ClientConfig,
11
+ ScrapeOptions,
12
+ Document,
13
+ SearchRequest,
14
+ SearchData,
15
+ SourceOption,
16
+ CategoryOption,
17
+ CrawlRequest,
18
+ CrawlResponse,
19
+ CrawlJob,
20
+ CrawlParamsRequest,
21
+ PDFParser,
22
+ CrawlParamsData,
23
+ WebhookConfig,
24
+ CrawlErrorsResponse,
25
+ ActiveCrawlsResponse,
26
+ MapOptions,
27
+ MapData,
28
+ FormatOption,
29
+ WaitAction,
30
+ ScreenshotAction,
31
+ ClickAction,
32
+ WriteAction,
33
+ PressAction,
34
+ ScrollAction,
35
+ ScrapeAction,
36
+ ExecuteJavascriptAction,
37
+ PDFAction,
38
+ Location,
39
+ PaginationConfig,
40
+ AgentOptions,
41
+ )
42
+ from .utils.http_client import HttpClient
43
+ from .utils.error_handler import FirecrawlError
44
+ from .methods import scrape as scrape_module
45
+ from .methods import crawl as crawl_module
46
+ from .methods import batch as batch_module
47
+ from .methods import search as search_module
48
+ from .methods import map as map_module
49
+ from .methods import batch as batch_methods
50
+ from .methods import usage as usage_methods
51
+ from .methods import extract as extract_module
52
+ from .methods import agent as agent_module
53
+ from .watcher import Watcher
54
+
55
+ class FirecrawlClient:
56
+ """
57
+ Main Firecrawl v2 API client.
58
+
59
+ This client provides a clean, modular interface to all Firecrawl functionality.
60
+ """
61
+
62
+ @staticmethod
63
+ def _is_cloud_service(url: str) -> bool:
64
+ return "api.firecrawl.dev" in url.lower()
65
+
66
+ def __init__(
67
+ self,
68
+ api_key: Optional[str] = None,
69
+ api_url: str = "https://api.firecrawl.dev",
70
+ timeout: Optional[float] = None,
71
+ max_retries: int = 3,
72
+ backoff_factor: float = 0.5
73
+ ):
74
+ """
75
+ Initialize the Firecrawl client.
76
+
77
+ Args:
78
+ api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
79
+ api_url: Base URL for the Firecrawl API
80
+ timeout: Request timeout in seconds
81
+ max_retries: Maximum number of retries for failed requests
82
+ backoff_factor: Exponential backoff factor for retries (e.g. 0.5 means wait 0.5s, then 1s, then 2s between retries)
83
+ """
84
+ if api_key is None:
85
+ api_key = os.getenv("FIRECRAWL_API_KEY")
86
+
87
+ if self._is_cloud_service(api_url) and not api_key:
88
+ raise ValueError(
89
+ "API key is required for the cloud API. Set FIRECRAWL_API_KEY environment variable "
90
+ "or pass api_key parameter."
91
+ )
92
+
93
+ self.config = ClientConfig(
94
+ api_key=api_key,
95
+ api_url=api_url,
96
+ timeout=timeout,
97
+ max_retries=max_retries,
98
+ backoff_factor=backoff_factor
99
+ )
100
+
101
+ self.http_client = HttpClient(api_key, api_url)
102
+
103
+ def scrape(
104
+ self,
105
+ url: str,
106
+ *,
107
+ formats: Optional[List['FormatOption']] = None,
108
+ headers: Optional[Dict[str, str]] = None,
109
+ include_tags: Optional[List[str]] = None,
110
+ exclude_tags: Optional[List[str]] = None,
111
+ only_main_content: Optional[bool] = None,
112
+ timeout: Optional[int] = None,
113
+ wait_for: Optional[int] = None,
114
+ mobile: Optional[bool] = None,
115
+ parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
116
+ actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
117
+ location: Optional['Location'] = None,
118
+ skip_tls_verification: Optional[bool] = None,
119
+ remove_base64_images: Optional[bool] = None,
120
+ fast_mode: Optional[bool] = None,
121
+ use_mock: Optional[str] = None,
122
+ block_ads: Optional[bool] = None,
123
+ proxy: Optional[str] = None,
124
+ max_age: Optional[int] = None,
125
+ store_in_cache: Optional[bool] = None,
126
+ integration: Optional[str] = None,
127
+ ) -> Document:
128
+ """
129
+ Scrape a single URL and return the document.
130
+ Args:
131
+ url: URL to scrape
132
+ formats: List of formats to scrape
133
+ headers: Dictionary of headers to use
134
+ include_tags: List of tags to include
135
+ exclude_tags: List of tags to exclude
136
+ only_main_content: Whether to only scrape the main content
137
+ timeout: Timeout in seconds
138
+ wait_for: Wait for a specific element to be present
139
+ mobile: Whether to use mobile mode
140
+ parsers: List of parsers to use
141
+ actions: List of actions to perform
142
+ location: Location to scrape
143
+ skip_tls_verification: Whether to skip TLS verification
144
+ remove_base64_images: Whether to remove base64 images
145
+ fast_mode: Whether to use fast mode
146
+ use_mock: Whether to use mock mode
147
+ block_ads: Whether to block ads
148
+ proxy: Proxy to use
149
+ max_age: Maximum age of the cache
150
+ store_in_cache: Whether to store the result in the cache
151
+ Returns:
152
+ Document
153
+ """
154
+ options = ScrapeOptions(
155
+ **{k: v for k, v in dict(
156
+ formats=formats,
157
+ headers=headers,
158
+ include_tags=include_tags,
159
+ exclude_tags=exclude_tags,
160
+ only_main_content=only_main_content,
161
+ timeout=timeout,
162
+ wait_for=wait_for,
163
+ mobile=mobile,
164
+ parsers=parsers,
165
+ actions=actions,
166
+ location=location,
167
+ skip_tls_verification=skip_tls_verification,
168
+ remove_base64_images=remove_base64_images,
169
+ fast_mode=fast_mode,
170
+ use_mock=use_mock,
171
+ block_ads=block_ads,
172
+ proxy=proxy,
173
+ max_age=max_age,
174
+ store_in_cache=store_in_cache,
175
+ integration=integration,
176
+ ).items() if v is not None}
177
+ ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache, integration]) else None
178
+ return scrape_module.scrape(self.http_client, url, options)
179
+
180
+ def search(
181
+ self,
182
+ query: str,
183
+ *,
184
+ sources: Optional[List[SourceOption]] = None,
185
+ categories: Optional[List[CategoryOption]] = None,
186
+ limit: Optional[int] = None,
187
+ tbs: Optional[str] = None,
188
+ location: Optional[str] = None,
189
+ ignore_invalid_urls: Optional[bool] = None,
190
+ timeout: Optional[int] = None,
191
+ scrape_options: Optional[ScrapeOptions] = None,
192
+ integration: Optional[str] = None,
193
+ ) -> SearchData:
194
+ """
195
+ Search for documents.
196
+
197
+ Args:
198
+ query: Search query string
199
+ limit: Maximum number of results to return (default: 5)
200
+ tbs: Time-based search filter
201
+ location: Location string for search
202
+ timeout: Request timeout in milliseconds (default: 300000)
203
+ page_options: Options for scraping individual pages
204
+
205
+ Returns:
206
+ SearchData containing the search results
207
+ """
208
+ request = SearchRequest(
209
+ query=query,
210
+ sources=sources,
211
+ categories=categories,
212
+ limit=limit,
213
+ tbs=tbs,
214
+ location=location,
215
+ ignore_invalid_urls=ignore_invalid_urls,
216
+ timeout=timeout,
217
+ scrape_options=scrape_options,
218
+ integration=integration,
219
+ )
220
+
221
+ return search_module.search(self.http_client, request)
222
+
223
+ def crawl(
224
+ self,
225
+ url: str,
226
+ *,
227
+ prompt: Optional[str] = None,
228
+ exclude_paths: Optional[List[str]] = None,
229
+ include_paths: Optional[List[str]] = None,
230
+ max_discovery_depth: Optional[int] = None,
231
+ ignore_sitemap: bool = False,
232
+ ignore_query_parameters: bool = False,
233
+ limit: Optional[int] = None,
234
+ crawl_entire_domain: bool = False,
235
+ allow_external_links: bool = False,
236
+ allow_subdomains: bool = False,
237
+ delay: Optional[int] = None,
238
+ max_concurrency: Optional[int] = None,
239
+ webhook: Optional[Union[str, WebhookConfig]] = None,
240
+ scrape_options: Optional[ScrapeOptions] = None,
241
+ zero_data_retention: bool = False,
242
+ poll_interval: int = 2,
243
+ timeout: Optional[int] = None,
244
+ request_timeout: Optional[float] = None,
245
+ integration: Optional[str] = None,
246
+ ) -> CrawlJob:
247
+ """
248
+ Start a crawl job and wait for it to complete.
249
+
250
+ Args:
251
+ url: Target URL to start crawling from
252
+ prompt: Optional prompt to guide the crawl
253
+ exclude_paths: Patterns of URLs to exclude
254
+ include_paths: Patterns of URLs to include
255
+ max_discovery_depth: Maximum depth for finding new URLs
256
+ ignore_sitemap: Skip sitemap.xml processing
257
+ ignore_query_parameters: Ignore URL parameters
258
+ limit: Maximum pages to crawl
259
+ crawl_entire_domain: Follow parent directory links
260
+ allow_external_links: Follow external domain links
261
+ allow_subdomains: Follow subdomains
262
+ delay: Delay in seconds between scrapes
263
+ max_concurrency: Maximum number of concurrent scrapes
264
+ webhook: Webhook configuration for notifications
265
+ scrape_options: Page scraping configuration
266
+ zero_data_retention: Whether to delete data after 24 hours
267
+ poll_interval: Seconds between status checks
268
+ timeout: Maximum seconds to wait for the entire crawl job to complete (None for no timeout)
269
+ request_timeout: Timeout (in seconds) for each individual HTTP request, including pagination requests when fetching results. If there are multiple pages, each page request gets this timeout
270
+
271
+ Returns:
272
+ CrawlJob when job completes
273
+
274
+ Raises:
275
+ ValueError: If request is invalid
276
+ Exception: If the crawl fails to start or complete
277
+ TimeoutError: If timeout is reached
278
+ """
279
+ request = CrawlRequest(
280
+ url=url,
281
+ prompt=prompt,
282
+ exclude_paths=exclude_paths,
283
+ include_paths=include_paths,
284
+ max_discovery_depth=max_discovery_depth,
285
+ ignore_sitemap=ignore_sitemap,
286
+ ignore_query_parameters=ignore_query_parameters,
287
+ limit=limit,
288
+ crawl_entire_domain=crawl_entire_domain,
289
+ allow_external_links=allow_external_links,
290
+ allow_subdomains=allow_subdomains,
291
+ delay=delay,
292
+ max_concurrency=max_concurrency,
293
+ webhook=webhook,
294
+ scrape_options=scrape_options,
295
+ zero_data_retention=zero_data_retention,
296
+ integration=integration,
297
+ )
298
+
299
+ return crawl_module.crawl(
300
+ self.http_client,
301
+ request,
302
+ poll_interval=poll_interval,
303
+ timeout=timeout,
304
+ request_timeout=request_timeout,
305
+ )
306
+
307
+ def start_crawl(
308
+ self,
309
+ url: str,
310
+ *,
311
+ prompt: Optional[str] = None,
312
+ exclude_paths: Optional[List[str]] = None,
313
+ include_paths: Optional[List[str]] = None,
314
+ max_discovery_depth: Optional[int] = None,
315
+ ignore_sitemap: bool = False,
316
+ ignore_query_parameters: bool = False,
317
+ limit: Optional[int] = None,
318
+ crawl_entire_domain: bool = False,
319
+ allow_external_links: bool = False,
320
+ allow_subdomains: bool = False,
321
+ delay: Optional[int] = None,
322
+ max_concurrency: Optional[int] = None,
323
+ webhook: Optional[Union[str, WebhookConfig]] = None,
324
+ scrape_options: Optional[ScrapeOptions] = None,
325
+ zero_data_retention: bool = False,
326
+ integration: Optional[str] = None,
327
+ ) -> CrawlResponse:
328
+ """
329
+ Start an asynchronous crawl job.
330
+
331
+ Args:
332
+ url: Target URL to start crawling from
333
+ prompt: Optional prompt to guide the crawl
334
+ exclude_paths: Patterns of URLs to exclude
335
+ include_paths: Patterns of URLs to include
336
+ max_discovery_depth: Maximum depth for finding new URLs
337
+ ignore_sitemap: Skip sitemap.xml processing
338
+ ignore_query_parameters: Ignore URL parameters
339
+ limit: Maximum pages to crawl
340
+ crawl_entire_domain: Follow parent directory links
341
+ allow_external_links: Follow external domain links
342
+ allow_subdomains: Follow subdomains
343
+ delay: Delay in seconds between scrapes
344
+ max_concurrency: Maximum number of concurrent scrapes
345
+ webhook: Webhook configuration for notifications
346
+ scrape_options: Page scraping configuration
347
+ zero_data_retention: Whether to delete data after 24 hours
348
+
349
+ Returns:
350
+ CrawlResponse with job information
351
+
352
+ Raises:
353
+ ValueError: If request is invalid
354
+ Exception: If the crawl operation fails to start
355
+ """
356
+ request = CrawlRequest(
357
+ url=url,
358
+ prompt=prompt,
359
+ exclude_paths=exclude_paths,
360
+ include_paths=include_paths,
361
+ max_discovery_depth=max_discovery_depth,
362
+ ignore_sitemap=ignore_sitemap,
363
+ ignore_query_parameters=ignore_query_parameters,
364
+ limit=limit,
365
+ crawl_entire_domain=crawl_entire_domain,
366
+ allow_external_links=allow_external_links,
367
+ allow_subdomains=allow_subdomains,
368
+ delay=delay,
369
+ max_concurrency=max_concurrency,
370
+ webhook=webhook,
371
+ scrape_options=scrape_options,
372
+ zero_data_retention=zero_data_retention,
373
+ integration=integration,
374
+ )
375
+
376
+ return crawl_module.start_crawl(self.http_client, request)
377
+
378
+ def get_crawl_status(
379
+ self,
380
+ job_id: str,
381
+ pagination_config: Optional[PaginationConfig] = None,
382
+ *,
383
+ request_timeout: Optional[float] = None,
384
+ ) -> CrawlJob:
385
+ """
386
+ Get the status of a crawl job.
387
+
388
+ Args:
389
+ job_id: ID of the crawl job
390
+ pagination_config: Optional configuration for pagination behavior
391
+ request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
392
+ is enabled (default) and there are multiple pages of results, this timeout applies to
393
+ each page request separately, not to the entire operation
394
+
395
+ Returns:
396
+ CrawlJob with current status and data
397
+
398
+ Raises:
399
+ Exception: If the status check fails
400
+ """
401
+ return crawl_module.get_crawl_status(
402
+ self.http_client,
403
+ job_id,
404
+ pagination_config=pagination_config,
405
+ request_timeout=request_timeout,
406
+ )
407
+
408
+ def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
409
+ """
410
+ Retrieve error details and robots.txt blocks for a given crawl job.
411
+
412
+ Args:
413
+ crawl_id: The ID of the crawl job
414
+
415
+ Returns:
416
+ CrawlErrorsResponse containing per-URL errors and robots-blocked URLs
417
+ """
418
+ return crawl_module.get_crawl_errors(self.http_client, crawl_id)
419
+
420
+ def get_active_crawls(self) -> ActiveCrawlsResponse:
421
+ """
422
+ Get a list of currently active crawl jobs.
423
+
424
+ Returns:
425
+ ActiveCrawlsResponse containing a list of active crawl jobs.
426
+ """
427
+ return crawl_module.get_active_crawls(self.http_client)
428
+
429
+ def active_crawls(self) -> ActiveCrawlsResponse:
430
+ """
431
+ List currently active crawl jobs for the authenticated team.
432
+
433
+ Returns:
434
+ ActiveCrawlsResponse containing the list of active crawl jobs
435
+ """
436
+ return self.get_active_crawls()
437
+
438
+ def map(
439
+ self,
440
+ url: str,
441
+ *,
442
+ search: Optional[str] = None,
443
+ include_subdomains: Optional[bool] = None,
444
+ ignore_query_parameters: Optional[bool] = None,
445
+ limit: Optional[int] = None,
446
+ sitemap: Optional[Literal["only", "include", "skip"]] = None,
447
+ timeout: Optional[int] = None,
448
+ integration: Optional[str] = None,
449
+ location: Optional[Location] = None,
450
+ ) -> MapData:
451
+ """Map a URL and return discovered links.
452
+
453
+ Args:
454
+ url: Root URL to explore
455
+ search: Optional substring filter for discovered links
456
+ include_subdomains: Whether to include subdomains
457
+ ignore_query_parameters: Whether to ignore query parameters when mapping
458
+ limit: Maximum number of links to return
459
+ sitemap: Sitemap usage mode ("only" | "include" | "skip")
460
+ timeout: Request timeout in milliseconds
461
+
462
+ Returns:
463
+ MapData containing the discovered links
464
+ """
465
+ options = MapOptions(
466
+ search=search,
467
+ include_subdomains=include_subdomains,
468
+ ignore_query_parameters=ignore_query_parameters,
469
+ limit=limit,
470
+ sitemap=sitemap if sitemap is not None else "include",
471
+ timeout=timeout,
472
+ integration=integration,
473
+ location=location
474
+ ) if any(v is not None for v in [search, include_subdomains, ignore_query_parameters, limit, sitemap, timeout, integration, location]) else None
475
+
476
+ return map_module.map(self.http_client, url, options)
477
+
478
+ def cancel_crawl(self, crawl_id: str) -> bool:
479
+ """
480
+ Cancel a crawl job.
481
+
482
+ Args:
483
+ crawl_id: The ID of the crawl job to cancel
484
+
485
+ Returns:
486
+ bool: True if the crawl was cancelled, False otherwise
487
+ """
488
+ return crawl_module.cancel_crawl(self.http_client, crawl_id)
489
+
490
+ def crawl_params_preview(self, url: str, prompt: str) -> CrawlParamsData:
491
+ """Derive crawl parameters from natural-language prompt.
492
+
493
+ Args:
494
+ url: Root URL
495
+ prompt: Instruction describing how to crawl
496
+
497
+ Returns:
498
+ CrawlParamsData with normalized crawl configuration
499
+ """
500
+ request = CrawlParamsRequest(url=url, prompt=prompt)
501
+ return crawl_module.crawl_params_preview(self.http_client, request)
502
+
503
+ def start_extract(
504
+ self,
505
+ urls: Optional[List[str]] = None,
506
+ *,
507
+ prompt: Optional[str] = None,
508
+ schema: Optional[Dict[str, Any]] = None,
509
+ system_prompt: Optional[str] = None,
510
+ allow_external_links: Optional[bool] = None,
511
+ enable_web_search: Optional[bool] = None,
512
+ show_sources: Optional[bool] = None,
513
+ scrape_options: Optional['ScrapeOptions'] = None,
514
+ ignore_invalid_urls: Optional[bool] = None,
515
+ integration: Optional[str] = None,
516
+ agent: Optional[AgentOptions] = None,
517
+ ):
518
+ """Start an extract job (non-blocking).
519
+
520
+ Args:
521
+ urls: URLs to extract from (optional)
522
+ prompt: Natural-language instruction for extraction
523
+ schema: Target JSON schema for the output
524
+ system_prompt: Optional system instruction
525
+ allow_external_links: Allow hyperlinks in output
526
+ enable_web_search: Whether to augment with web search
527
+ show_sources: Include per-field/source mapping when available
528
+ scrape_options: Scrape options applied prior to extraction
529
+ ignore_invalid_urls: Skip invalid URLs instead of failing
530
+ integration: Integration tag/name
531
+ agent: Agent configuration
532
+ Returns:
533
+ Response payload with job id/status (poll with get_extract_status)
534
+ """
535
+ return extract_module.start_extract(
536
+ self.http_client,
537
+ urls,
538
+ prompt=prompt,
539
+ schema=schema,
540
+ system_prompt=system_prompt,
541
+ allow_external_links=allow_external_links,
542
+ enable_web_search=enable_web_search,
543
+ show_sources=show_sources,
544
+ scrape_options=scrape_options,
545
+ ignore_invalid_urls=ignore_invalid_urls,
546
+ integration=integration,
547
+ agent=agent,
548
+ )
549
+
550
+ def extract(
551
+ self,
552
+ urls: Optional[List[str]] = None,
553
+ *,
554
+ prompt: Optional[str] = None,
555
+ schema: Optional[Dict[str, Any]] = None,
556
+ system_prompt: Optional[str] = None,
557
+ allow_external_links: Optional[bool] = None,
558
+ enable_web_search: Optional[bool] = None,
559
+ show_sources: Optional[bool] = None,
560
+ scrape_options: Optional['ScrapeOptions'] = None,
561
+ ignore_invalid_urls: Optional[bool] = None,
562
+ poll_interval: int = 2,
563
+ timeout: Optional[int] = None,
564
+ integration: Optional[str] = None,
565
+ agent: Optional[AgentOptions] = None,
566
+ ):
567
+ """Extract structured data and wait until completion.
568
+
569
+ Args:
570
+ urls: URLs to extract from (optional)
571
+ prompt: Natural-language instruction for extraction
572
+ schema: Target JSON schema for the output
573
+ system_prompt: Optional system instruction
574
+ allow_external_links: Allow hyperlinks in output
575
+ enable_web_search: Whether to augment with web search
576
+ show_sources: Include per-field/source mapping when available
577
+ scrape_options: Scrape options applied prior to extraction
578
+ ignore_invalid_urls: Skip invalid URLs instead of failing
579
+ poll_interval: Seconds between status checks
580
+ timeout: Maximum seconds to wait (None for no timeout)
581
+ integration: Integration tag/name
582
+ agent: Agent configuration
583
+ Returns:
584
+ Final extract response when completed
585
+ """
586
+ return extract_module.extract(
587
+ self.http_client,
588
+ urls,
589
+ prompt=prompt,
590
+ schema=schema,
591
+ system_prompt=system_prompt,
592
+ allow_external_links=allow_external_links,
593
+ enable_web_search=enable_web_search,
594
+ show_sources=show_sources,
595
+ scrape_options=scrape_options,
596
+ ignore_invalid_urls=ignore_invalid_urls,
597
+ poll_interval=poll_interval,
598
+ timeout=timeout,
599
+ integration=integration,
600
+ agent=agent,
601
+ )
602
+
603
+ def start_batch_scrape(
604
+ self,
605
+ urls: List[str],
606
+ *,
607
+ formats: Optional[List['FormatOption']] = None,
608
+ headers: Optional[Dict[str, str]] = None,
609
+ include_tags: Optional[List[str]] = None,
610
+ exclude_tags: Optional[List[str]] = None,
611
+ only_main_content: Optional[bool] = None,
612
+ timeout: Optional[int] = None,
613
+ wait_for: Optional[int] = None,
614
+ mobile: Optional[bool] = None,
615
+ parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
616
+ actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
617
+ location: Optional['Location'] = None,
618
+ skip_tls_verification: Optional[bool] = None,
619
+ remove_base64_images: Optional[bool] = None,
620
+ fast_mode: Optional[bool] = None,
621
+ use_mock: Optional[str] = None,
622
+ block_ads: Optional[bool] = None,
623
+ proxy: Optional[str] = None,
624
+ max_age: Optional[int] = None,
625
+ store_in_cache: Optional[bool] = None,
626
+ webhook: Optional[Union[str, WebhookConfig]] = None,
627
+ append_to_id: Optional[str] = None,
628
+ ignore_invalid_urls: Optional[bool] = None,
629
+ max_concurrency: Optional[int] = None,
630
+ zero_data_retention: Optional[bool] = None,
631
+ integration: Optional[str] = None,
632
+ idempotency_key: Optional[str] = None,
633
+ ):
634
+ """Start a batch scrape job over multiple URLs (non-blocking).
635
+
636
+ Args:
637
+ urls: List of URLs to scrape
638
+ formats: Output formats to collect per URL
639
+ headers: HTTP headers
640
+ include_tags: HTML tags to include
641
+ exclude_tags: HTML tags to exclude
642
+ only_main_content: Restrict scraping to main content
643
+ timeout: Per-request timeout in milliseconds
644
+ wait_for: Wait condition in milliseconds
645
+ mobile: Emulate mobile viewport
646
+ parsers: Parser list (e.g., ["pdf"])
647
+ actions: Browser actions to perform
648
+ location: Location settings
649
+ skip_tls_verification: Skip TLS verification
650
+ remove_base64_images: Remove base64 images from output
651
+ fast_mode: Prefer faster scraping modes
652
+ use_mock: Use a mock data source (internal/testing)
653
+ block_ads: Block ads during scraping
654
+ proxy: Proxy setting
655
+ max_age: Cache max age
656
+ store_in_cache: Whether to store results in cache
657
+ webhook: Webhook configuration
658
+ append_to_id: Append to an existing batch job
659
+ ignore_invalid_urls: Skip invalid URLs without failing
660
+ max_concurrency: Max concurrent scrapes
661
+ zero_data_retention: Delete data after 24 hours
662
+ integration: Integration tag/name
663
+ idempotency_key: Header used to deduplicate starts
664
+
665
+ Returns:
666
+ Response payload with job id (poll with get_batch_scrape_status)
667
+ """
668
+ options = ScrapeOptions(
669
+ **{k: v for k, v in dict(
670
+ formats=formats,
671
+ headers=headers,
672
+ include_tags=include_tags,
673
+ exclude_tags=exclude_tags,
674
+ only_main_content=only_main_content,
675
+ timeout=timeout,
676
+ wait_for=wait_for,
677
+ mobile=mobile,
678
+ parsers=parsers,
679
+ actions=actions,
680
+ location=location,
681
+ skip_tls_verification=skip_tls_verification,
682
+ remove_base64_images=remove_base64_images,
683
+ fast_mode=fast_mode,
684
+ use_mock=use_mock,
685
+ block_ads=block_ads,
686
+ proxy=proxy,
687
+ max_age=max_age,
688
+ store_in_cache=store_in_cache,
689
+ ).items() if v is not None}
690
+ ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
691
+
692
+ return batch_module.start_batch_scrape(
693
+ self.http_client,
694
+ urls,
695
+ options=options,
696
+ webhook=webhook,
697
+ append_to_id=append_to_id,
698
+ ignore_invalid_urls=ignore_invalid_urls,
699
+ max_concurrency=max_concurrency,
700
+ zero_data_retention=zero_data_retention,
701
+ integration=integration,
702
+ idempotency_key=idempotency_key,
703
+ )
704
+
705
+ def get_batch_scrape_status(
706
+ self,
707
+ job_id: str,
708
+ pagination_config: Optional[PaginationConfig] = None
709
+ ):
710
+ """Get current status and any scraped data for a batch job.
711
+
712
+ Args:
713
+ job_id: Batch job ID
714
+ pagination_config: Optional configuration for pagination behavior
715
+
716
+ Returns:
717
+ Status payload including counts and partial data
718
+ """
719
+ return batch_module.get_batch_scrape_status(
720
+ self.http_client,
721
+ job_id,
722
+ pagination_config=pagination_config
723
+ )
724
+
725
+ def cancel_batch_scrape(self, job_id: str) -> bool:
726
+ """Cancel a running batch scrape job.
727
+
728
+ Args:
729
+ job_id: Batch job ID
730
+
731
+ Returns:
732
+ True if the job was cancelled
733
+ """
734
+ return batch_module.cancel_batch_scrape(self.http_client, job_id)
735
+
736
+ def get_batch_scrape_errors(self, job_id: str):
737
+ """Retrieve error details for a batch scrape job.
738
+
739
+ Args:
740
+ job_id: Batch job ID
741
+
742
+ Returns:
743
+ Errors and robots-blocked URLs for the job
744
+ """
745
+ return batch_methods.get_batch_scrape_errors(self.http_client, job_id)
746
+
747
+ def get_extract_status(self, job_id: str):
748
+ """Get the current status (and data if completed) of an extract job.
749
+
750
+ Args:
751
+ job_id: Extract job ID
752
+
753
+ Returns:
754
+ Extract response payload with status and optional data
755
+ """
756
+ return extract_module.get_extract_status(self.http_client, job_id)
757
+
758
    def start_agent(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: str,
        schema: Optional[Any] = None,
        integration: Optional[str] = None,
        max_credits: Optional[int] = None,
        strict_constrain_to_urls: Optional[bool] = None,
    ):
        """Start an agent job (non-blocking).

        Args:
            urls: URLs to process (optional)
            prompt: Natural-language instruction for the agent
            schema: Target JSON schema for the output (dict or Pydantic BaseModel)
            integration: Integration tag/name
            max_credits: Maximum credits to use (optional)
            strict_constrain_to_urls: Presumably constrains the agent to the
                provided URLs only (name-derived; confirm exact semantics
                against the agent module/API docs)

        Returns:
            Response payload with job id/status (poll with get_agent_status)
        """
        return agent_module.start_agent(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            integration=integration,
            max_credits=max_credits,
            strict_constrain_to_urls=strict_constrain_to_urls,
        )
788
+
789
    def agent(
        self,
        urls: Optional[List[str]] = None,
        *,
        prompt: str,
        schema: Optional[Any] = None,
        integration: Optional[str] = None,
        poll_interval: int = 2,
        timeout: Optional[int] = None,
        max_credits: Optional[int] = None,
        strict_constrain_to_urls: Optional[bool] = None,
    ):
        """Run an agent and wait until completion.

        Args:
            urls: URLs to process (optional)
            prompt: Natural-language instruction for the agent
            schema: Target JSON schema for the output (dict or Pydantic BaseModel)
            integration: Integration tag/name
            poll_interval: Seconds between status checks
            timeout: Maximum seconds to wait (None for no timeout)
            max_credits: Maximum credits to use (optional)
            strict_constrain_to_urls: Presumably constrains the agent to the
                provided URLs only (name-derived; confirm exact semantics
                against the agent module/API docs)

        Returns:
            Final agent response when completed
        """
        return agent_module.agent(
            self.http_client,
            urls,
            prompt=prompt,
            schema=schema,
            integration=integration,
            poll_interval=poll_interval,
            timeout=timeout,
            max_credits=max_credits,
            strict_constrain_to_urls=strict_constrain_to_urls,
        )
825
+
826
+ def get_agent_status(self, job_id: str):
827
+ """Get the current status (and data if completed) of an agent job.
828
+
829
+ Args:
830
+ job_id: Agent job ID
831
+
832
+ Returns:
833
+ Agent response payload with status and optional data
834
+ """
835
+ return agent_module.get_agent_status(self.http_client, job_id)
836
+
837
+ def cancel_agent(self, job_id: str) -> bool:
838
+ """Cancel a running agent job.
839
+
840
+ Args:
841
+ job_id: Agent job ID
842
+
843
+ Returns:
844
+ True if the agent was cancelled
845
+ """
846
+ return agent_module.cancel_agent(self.http_client, job_id)
847
+
848
+ def get_concurrency(self):
849
+ """Get current concurrency and maximum allowed for this team/key (v2)."""
850
+ return usage_methods.get_concurrency(self.http_client)
851
+
852
+ def get_credit_usage(self):
853
+ """Get remaining credits for this team/key (v2)."""
854
+ return usage_methods.get_credit_usage(self.http_client)
855
+
856
+ def get_token_usage(self):
857
+ """Get recent token usage metrics (v2)."""
858
+ return usage_methods.get_token_usage(self.http_client)
859
+
860
+ def get_credit_usage_historical(self, by_api_key: bool = False):
861
+ """Get historical credit usage (v2)."""
862
+ return usage_methods.get_credit_usage_historical(self.http_client, by_api_key)
863
+
864
+ def get_token_usage_historical(self, by_api_key: bool = False):
865
+ """Get historical token usage (v2)."""
866
+ return usage_methods.get_token_usage_historical(self.http_client, by_api_key)
867
+
868
+ def get_queue_status(self):
869
+ """Get metrics about the team's scrape queue."""
870
+ return usage_methods.get_queue_status(self.http_client)
871
+
872
+ def watcher(
873
+ self,
874
+ job_id: str,
875
+ *,
876
+ kind: Literal["crawl", "batch"] = "crawl",
877
+ poll_interval: int = 2,
878
+ timeout: Optional[int] = None,
879
+ ) -> Watcher:
880
+ """Create a watcher for crawl or batch jobs.
881
+
882
+ Args:
883
+ job_id: Job ID to watch
884
+ kind: Job kind ("crawl" or "batch")
885
+ poll_interval: Seconds between status checks
886
+ timeout: Maximum seconds to watch (None for no timeout)
887
+
888
+ Returns:
889
+ Watcher instance
890
+ """
891
+ return Watcher(self, job_id, kind=kind, poll_interval=poll_interval, timeout=timeout)
892
+
893
+ def batch_scrape(
894
+ self,
895
+ urls: List[str],
896
+ *,
897
+ formats: Optional[List['FormatOption']] = None,
898
+ headers: Optional[Dict[str, str]] = None,
899
+ include_tags: Optional[List[str]] = None,
900
+ exclude_tags: Optional[List[str]] = None,
901
+ only_main_content: Optional[bool] = None,
902
+ timeout: Optional[int] = None,
903
+ wait_for: Optional[int] = None,
904
+ mobile: Optional[bool] = None,
905
+ parsers: Optional[Union[List[str], List[Union[str, PDFParser]]]] = None,
906
+ actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
907
+ location: Optional['Location'] = None,
908
+ skip_tls_verification: Optional[bool] = None,
909
+ remove_base64_images: Optional[bool] = None,
910
+ fast_mode: Optional[bool] = None,
911
+ use_mock: Optional[str] = None,
912
+ block_ads: Optional[bool] = None,
913
+ proxy: Optional[str] = None,
914
+ max_age: Optional[int] = None,
915
+ store_in_cache: Optional[bool] = None,
916
+ webhook: Optional[Union[str, WebhookConfig]] = None,
917
+ append_to_id: Optional[str] = None,
918
+ ignore_invalid_urls: Optional[bool] = None,
919
+ max_concurrency: Optional[int] = None,
920
+ zero_data_retention: Optional[bool] = None,
921
+ integration: Optional[str] = None,
922
+ idempotency_key: Optional[str] = None,
923
+ poll_interval: int = 2,
924
+ wait_timeout: Optional[int] = None,
925
+ ):
926
+ """
927
+ Start a batch scrape job and wait until completion.
928
+ """
929
+ options = ScrapeOptions(
930
+ **{k: v for k, v in dict(
931
+ formats=formats,
932
+ headers=headers,
933
+ include_tags=include_tags,
934
+ exclude_tags=exclude_tags,
935
+ only_main_content=only_main_content,
936
+ timeout=timeout,
937
+ wait_for=wait_for,
938
+ mobile=mobile,
939
+ parsers=parsers,
940
+ actions=actions,
941
+ location=location,
942
+ skip_tls_verification=skip_tls_verification,
943
+ remove_base64_images=remove_base64_images,
944
+ fast_mode=fast_mode,
945
+ use_mock=use_mock,
946
+ block_ads=block_ads,
947
+ proxy=proxy,
948
+ max_age=max_age,
949
+ store_in_cache=store_in_cache,
950
+ ).items() if v is not None}
951
+ ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
952
+
953
+ return batch_module.batch_scrape(
954
+ self.http_client,
955
+ urls,
956
+ options=options,
957
+ webhook=webhook,
958
+ append_to_id=append_to_id,
959
+ ignore_invalid_urls=ignore_invalid_urls,
960
+ max_concurrency=max_concurrency,
961
+ zero_data_retention=zero_data_retention,
962
+ integration=integration,
963
+ idempotency_key=idempotency_key,
964
+ poll_interval=poll_interval,
965
+ timeout=wait_timeout,
966
+ )
967
+