firecrawl 2.16.5 (py3-none-any.whl) → 3.0.3 (py3-none-any.whl)

This diff compares two publicly released versions of the package as they appear in their public registry and is provided for informational purposes only.

Potentially problematic release.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/client.py ADDED
@@ -0,0 +1,802 @@
+ """
+ Main Firecrawl v2 API client.
+
+ This module provides the main client class that orchestrates all v2 functionality.
+ """
+
+ import os
+ from typing import Optional, List, Dict, Any, Callable, Union, Literal
+ from .types import (
+     ClientConfig,
+     ScrapeOptions,
+     Document,
+     SearchRequest,
+     SearchData,
+     SourceOption,
+     CrawlRequest,
+     CrawlResponse,
+     CrawlJob,
+     CrawlParamsRequest,
+     CrawlParamsData,
+     WebhookConfig,
+     CrawlErrorsResponse,
+     ActiveCrawlsResponse,
+     MapOptions,
+     MapData,
+     FormatOption,
+     WaitAction,
+     ScreenshotAction,
+     ClickAction,
+     WriteAction,
+     PressAction,
+     ScrollAction,
+     ScrapeAction,
+     ExecuteJavascriptAction,
+     PDFAction,
+     Location,
+ )
+ from .utils.http_client import HttpClient
+ from .utils.error_handler import FirecrawlError
+ from .methods import scrape as scrape_module
+ from .methods import crawl as crawl_module
+ from .methods import batch as batch_module
+ from .methods import search as search_module
+ from .methods import map as map_module
+ from .methods import batch as batch_methods
+ from .methods import usage as usage_methods
+ from .methods import extract as extract_module
+ from .watcher import Watcher
+
+ class FirecrawlClient:
+     """
+     Main Firecrawl v2 API client.
+
+     This client provides a clean, modular interface to all Firecrawl functionality.
+     """
+
+     def __init__(
+         self,
+         api_key: Optional[str] = None,
+         api_url: str = "https://api.firecrawl.dev",
+         timeout: Optional[float] = None,
+         max_retries: int = 3,
+         backoff_factor: float = 0.5
+     ):
+         """
+         Initialize the Firecrawl client.
+
+         Args:
+             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
+             api_url: Base URL for the Firecrawl API
+             timeout: Request timeout in seconds
+             max_retries: Maximum number of retries for failed requests
+             backoff_factor: Exponential backoff factor for retries (e.g. 0.5 means wait 0.5s, then 1s, then 2s between retries)
+         """
+         if api_key is None:
+             api_key = os.getenv("FIRECRAWL_API_KEY")
+
+         if not api_key:
+             raise ValueError(
+                 "API key is required. Set FIRECRAWL_API_KEY environment variable "
+                 "or pass api_key parameter."
+             )
+
+         self.config = ClientConfig(
+             api_key=api_key,
+             api_url=api_url,
+             timeout=timeout,
+             max_retries=max_retries,
+             backoff_factor=backoff_factor
+         )
+
+         self.http_client = HttpClient(api_key, api_url)
+
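# Illustrative usage sketch (not part of the diffed file): constructing the client defined
# above. Assumes FIRECRAWL_API_KEY is exported; the import path mirrors the module location
# firecrawl/v2/client.py shown in this diff, though the top-level package may re-export the
# class under a different name.
from firecrawl.v2.client import FirecrawlClient

client = FirecrawlClient(
    api_url="https://api.firecrawl.dev",  # default value from __init__ above
    max_retries=3,
    backoff_factor=0.5,                   # retry waits of 0.5s, 1s, 2s per the docstring
)
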
+     def scrape(
+         self,
+         url: str,
+         *,
+         formats: Optional[List['FormatOption']] = None,
+         headers: Optional[Dict[str, str]] = None,
+         include_tags: Optional[List[str]] = None,
+         exclude_tags: Optional[List[str]] = None,
+         only_main_content: Optional[bool] = None,
+         timeout: Optional[int] = None,
+         wait_for: Optional[int] = None,
+         mobile: Optional[bool] = None,
+         parsers: Optional[List[str]] = None,
+         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
+         location: Optional['Location'] = None,
+         skip_tls_verification: Optional[bool] = None,
+         remove_base64_images: Optional[bool] = None,
+         fast_mode: Optional[bool] = None,
+         use_mock: Optional[str] = None,
+         block_ads: Optional[bool] = None,
+         proxy: Optional[str] = None,
+         max_age: Optional[int] = None,
+         store_in_cache: Optional[bool] = None,
+     ) -> Document:
+         """
+         Scrape a single URL and return the document.
+         Args:
+             url: URL to scrape
+             formats: List of formats to scrape
+             headers: Dictionary of headers to use
+             include_tags: List of tags to include
+             exclude_tags: List of tags to exclude
+             only_main_content: Whether to only scrape the main content
+             timeout: Timeout in seconds
+             wait_for: Wait for a specific element to be present
+             mobile: Whether to use mobile mode
+             parsers: List of parsers to use
+             actions: List of actions to perform
+             location: Location to scrape
+             skip_tls_verification: Whether to skip TLS verification
+             remove_base64_images: Whether to remove base64 images
+             fast_mode: Whether to use fast mode
+             use_mock: Whether to use mock mode
+             block_ads: Whether to block ads
+             proxy: Proxy to use
+             max_age: Maximum age of the cache
+             store_in_cache: Whether to store the result in the cache
+         Returns:
+             Document
+         """
+         options = ScrapeOptions(
+             **{k: v for k, v in dict(
+                 formats=formats,
+                 headers=headers,
+                 include_tags=include_tags,
+                 exclude_tags=exclude_tags,
+                 only_main_content=only_main_content,
+                 timeout=timeout,
+                 wait_for=wait_for,
+                 mobile=mobile,
+                 parsers=parsers,
+                 actions=actions,
+                 location=location,
+                 skip_tls_verification=skip_tls_verification,
+                 remove_base64_images=remove_base64_images,
+                 fast_mode=fast_mode,
+                 use_mock=use_mock,
+                 block_ads=block_ads,
+                 proxy=proxy,
+                 max_age=max_age,
+                 store_in_cache=store_in_cache,
+             ).items() if v is not None}
+         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
+         return scrape_module.scrape(self.http_client, url, options)
+
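# Illustrative sketch (not part of the diffed file): a minimal scrape call, reusing the
# `client` constructed in the earlier sketch. Keyword names come from the signature above;
# passing plain format strings assumes FormatOption accepts them.
doc = client.scrape(
    "https://example.com",
    formats=["markdown"],      # assumed string form of FormatOption
    only_main_content=True,
)
print(doc)                     # a Document, per the return annotation above
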
+     def search(
+         self,
+         query: str,
+         *,
+         sources: Optional[List[SourceOption]] = None,
+         limit: Optional[int] = None,
+         tbs: Optional[str] = None,
+         location: Optional[str] = None,
+         ignore_invalid_urls: Optional[bool] = None,
+         timeout: Optional[int] = None,
+         scrape_options: Optional[ScrapeOptions] = None,
+     ) -> SearchData:
+         """
+         Search for documents.
+
+         Args:
+             query: Search query string
+             limit: Maximum number of results to return (default: 5)
+             tbs: Time-based search filter
+             location: Location string for search
+             timeout: Request timeout in milliseconds (default: 60000)
+             page_options: Options for scraping individual pages
+
+         Returns:
+             SearchData containing the search results
+         """
+         request = SearchRequest(
+             query=query,
+             sources=sources,
+             limit=limit,
+             tbs=tbs,
+             location=location,
+             ignore_invalid_urls=ignore_invalid_urls,
+             timeout=timeout,
+             scrape_options=scrape_options,
+         )
+
+         return search_module.search(self.http_client, request)
+
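# Illustrative sketch (not part of the diffed file): searching and scraping the results in
# one call, reusing `client`. The ScrapeOptions import path mirrors firecrawl/v2/types.py
# from this diff's file list; SearchData's fields live in that module, so the result is
# printed as-is.
from firecrawl.v2.types import ScrapeOptions

results = client.search(
    "firecrawl python sdk",
    limit=5,
    scrape_options=ScrapeOptions(formats=["markdown"]),  # assumed field/format spelling
)
print(results)
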
+     def crawl(
+         self,
+         url: str,
+         *,
+         prompt: Optional[str] = None,
+         exclude_paths: Optional[List[str]] = None,
+         include_paths: Optional[List[str]] = None,
+         max_discovery_depth: Optional[int] = None,
+         ignore_sitemap: bool = False,
+         ignore_query_parameters: bool = False,
+         limit: Optional[int] = None,
+         crawl_entire_domain: bool = False,
+         allow_external_links: bool = False,
+         allow_subdomains: bool = False,
+         delay: Optional[int] = None,
+         max_concurrency: Optional[int] = None,
+         webhook: Optional[Union[str, WebhookConfig]] = None,
+         scrape_options: Optional[ScrapeOptions] = None,
+         zero_data_retention: bool = False,
+         poll_interval: int = 2,
+         timeout: Optional[int] = None
+     ) -> CrawlJob:
+         """
+         Start a crawl job and wait for it to complete.
+
+         Args:
+             url: Target URL to start crawling from
+             prompt: Optional prompt to guide the crawl
+             exclude_paths: Patterns of URLs to exclude
+             include_paths: Patterns of URLs to include
+             max_discovery_depth: Maximum depth for finding new URLs
+             ignore_sitemap: Skip sitemap.xml processing
+             ignore_query_parameters: Ignore URL parameters
+             limit: Maximum pages to crawl
+             crawl_entire_domain: Follow parent directory links
+             allow_external_links: Follow external domain links
+             allow_subdomains: Follow subdomains
+             delay: Delay in seconds between scrapes
+             max_concurrency: Maximum number of concurrent scrapes
+             webhook: Webhook configuration for notifications
+             scrape_options: Page scraping configuration
+             zero_data_retention: Whether to delete data after 24 hours
+             poll_interval: Seconds between status checks
+             timeout: Maximum seconds to wait (None for no timeout)
+
+         Returns:
+             CrawlJob when job completes
+
+         Raises:
+             ValueError: If request is invalid
+             Exception: If the crawl fails to start or complete
+             TimeoutError: If timeout is reached
+         """
+         request = CrawlRequest(
+             url=url,
+             prompt=prompt,
+             exclude_paths=exclude_paths,
+             include_paths=include_paths,
+             max_discovery_depth=max_discovery_depth,
+             ignore_sitemap=ignore_sitemap,
+             ignore_query_parameters=ignore_query_parameters,
+             limit=limit,
+             crawl_entire_domain=crawl_entire_domain,
+             allow_external_links=allow_external_links,
+             allow_subdomains=allow_subdomains,
+             delay=delay,
+             max_concurrency=max_concurrency,
+             webhook=webhook,
+             scrape_options=scrape_options,
+             zero_data_retention=zero_data_retention
+         )
+
+         return crawl_module.crawl(
+             self.http_client,
+             request,
+             poll_interval=poll_interval,
+             timeout=timeout
+         )
+
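# Illustrative sketch (not part of the diffed file): a blocking crawl that polls every
# 5 seconds and gives up after 10 minutes, reusing `client`. CrawlJob's fields are defined
# in types.py, so the job is printed rather than unpacked.
job = client.crawl(
    "https://example.com",
    limit=25,
    poll_interval=5,
    timeout=600,      # seconds; None would wait indefinitely
)
print(job)
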
+     def start_crawl(
+         self,
+         url: str,
+         *,
+         prompt: Optional[str] = None,
+         exclude_paths: Optional[List[str]] = None,
+         include_paths: Optional[List[str]] = None,
+         max_discovery_depth: Optional[int] = None,
+         ignore_sitemap: bool = False,
+         ignore_query_parameters: bool = False,
+         limit: Optional[int] = None,
+         crawl_entire_domain: bool = False,
+         allow_external_links: bool = False,
+         allow_subdomains: bool = False,
+         delay: Optional[int] = None,
+         max_concurrency: Optional[int] = None,
+         webhook: Optional[Union[str, WebhookConfig]] = None,
+         scrape_options: Optional[ScrapeOptions] = None,
+         zero_data_retention: bool = False
+     ) -> CrawlResponse:
+         """
+         Start an asynchronous crawl job.
+
+         Args:
+             url: Target URL to start crawling from
+             prompt: Optional prompt to guide the crawl
+             exclude_paths: Patterns of URLs to exclude
+             include_paths: Patterns of URLs to include
+             max_discovery_depth: Maximum depth for finding new URLs
+             ignore_sitemap: Skip sitemap.xml processing
+             ignore_query_parameters: Ignore URL parameters
+             limit: Maximum pages to crawl
+             crawl_entire_domain: Follow parent directory links
+             allow_external_links: Follow external domain links
+             allow_subdomains: Follow subdomains
+             delay: Delay in seconds between scrapes
+             max_concurrency: Maximum number of concurrent scrapes
+             webhook: Webhook configuration for notifications
+             scrape_options: Page scraping configuration
+             zero_data_retention: Whether to delete data after 24 hours
+
+         Returns:
+             CrawlResponse with job information
+
+         Raises:
+             ValueError: If request is invalid
+             Exception: If the crawl operation fails to start
+         """
+         request = CrawlRequest(
+             url=url,
+             prompt=prompt,
+             exclude_paths=exclude_paths,
+             include_paths=include_paths,
+             max_discovery_depth=max_discovery_depth,
+             ignore_sitemap=ignore_sitemap,
+             ignore_query_parameters=ignore_query_parameters,
+             limit=limit,
+             crawl_entire_domain=crawl_entire_domain,
+             allow_external_links=allow_external_links,
+             allow_subdomains=allow_subdomains,
+             delay=delay,
+             max_concurrency=max_concurrency,
+             webhook=webhook,
+             scrape_options=scrape_options,
+             zero_data_retention=zero_data_retention
+         )
+
+         return crawl_module.start_crawl(self.http_client, request)
+
+     def get_crawl_status(self, job_id: str) -> CrawlJob:
+         """
+         Get the status of a crawl job.
+
+         Args:
+             job_id: ID of the crawl job
+
+         Returns:
+             CrawlJob with current status and data
+
+         Raises:
+             Exception: If the status check fails
+         """
+         return crawl_module.get_crawl_status(self.http_client, job_id)
+
+     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
+         """
+         Retrieve error details and robots.txt blocks for a given crawl job.
+
+         Args:
+             crawl_id: The ID of the crawl job
+
+         Returns:
+             CrawlErrorsResponse containing per-URL errors and robots-blocked URLs
+         """
+         return crawl_module.get_crawl_errors(self.http_client, crawl_id)
+
+     def get_active_crawls(self) -> ActiveCrawlsResponse:
+         """
+         Get a list of currently active crawl jobs.
+
+         Returns:
+             ActiveCrawlsResponse containing a list of active crawl jobs.
+         """
+         return crawl_module.get_active_crawls(self.http_client)
+
+     def active_crawls(self) -> ActiveCrawlsResponse:
+         """
+         List currently active crawl jobs for the authenticated team.
+
+         Returns:
+             ActiveCrawlsResponse containing the list of active crawl jobs
+         """
+         return self.get_active_crawls()
+
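# Illustrative sketch (not part of the diffed file): the non-blocking crawl flow, reusing
# `client`. The field carrying the job id on CrawlResponse is defined in types.py and not
# shown here, so `id` is an assumption.
started = client.start_crawl("https://example.com", limit=10)
job_id = getattr(started, "id", None)    # assumed attribute name on CrawlResponse
if job_id:
    print(client.get_crawl_status(job_id))
    print(client.get_crawl_errors(job_id))
    print(client.active_crawls())
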
+     def map(
+         self,
+         url: str,
+         *,
+         search: Optional[str] = None,
+         include_subdomains: Optional[bool] = None,
+         limit: Optional[int] = None,
+         sitemap: Optional[Literal["only", "include", "skip"]] = None,
+         timeout: Optional[int] = None,
+     ) -> MapData:
+         """Map a URL and return discovered links.
+
+         Args:
+             url: Root URL to explore
+             search: Optional substring filter for discovered links
+             include_subdomains: Whether to include subdomains
+             limit: Maximum number of links to return
+             sitemap: Sitemap usage mode ("only" | "include" | "skip")
+             timeout: Request timeout in milliseconds
+
+         Returns:
+             MapData containing the discovered links
+         """
+         options = MapOptions(
+             search=search,
+             include_subdomains=include_subdomains,
+             limit=limit,
+             sitemap=sitemap if sitemap is not None else "include",
+             timeout=timeout,
+         ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, timeout]) else None
+
+         return map_module.map(self.http_client, url, options)
+
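# Illustrative sketch (not part of the diffed file): link discovery with a substring
# filter, reusing `client`. Sitemap mode "include" matches the default applied above.
links = client.map(
    "https://example.com",
    search="blog",
    limit=100,
    sitemap="include",
)
print(links)   # MapData; its link fields are defined in types.py
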
+     def cancel_crawl(self, crawl_id: str) -> bool:
+         """
+         Cancel a crawl job.
+
+         Args:
+             crawl_id: The ID of the crawl job to cancel
+
+         Returns:
+             bool: True if the crawl was cancelled, False otherwise
+         """
+         return crawl_module.cancel_crawl(self.http_client, crawl_id)
+
+     def crawl_params_preview(self, url: str, prompt: str) -> CrawlParamsData:
+         """Derive crawl parameters from natural-language prompt.
+
+         Args:
+             url: Root URL
+             prompt: Instruction describing how to crawl
+
+         Returns:
+             CrawlParamsData with normalized crawl configuration
+         """
+         request = CrawlParamsRequest(url=url, prompt=prompt)
+         return crawl_module.crawl_params_preview(self.http_client, request)
+
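# Illustrative sketch (not part of the diffed file): previewing the crawl configuration
# derived from a natural-language prompt, reusing `client`.
params = client.crawl_params_preview(
    "https://example.com",
    "Crawl only the documentation pages and skip the blog",
)
print(params)  # CrawlParamsData with the normalized configuration
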
+     def start_extract(
+         self,
+         urls: Optional[List[str]] = None,
+         *,
+         prompt: Optional[str] = None,
+         schema: Optional[Dict[str, Any]] = None,
+         system_prompt: Optional[str] = None,
+         allow_external_links: Optional[bool] = None,
+         enable_web_search: Optional[bool] = None,
+         show_sources: Optional[bool] = None,
+         scrape_options: Optional['ScrapeOptions'] = None,
+         ignore_invalid_urls: Optional[bool] = None,
+     ):
+         """Start an extract job (non-blocking).
+
+         Args:
+             urls: URLs to extract from (optional)
+             prompt: Natural-language instruction for extraction
+             schema: Target JSON schema for the output
+             system_prompt: Optional system instruction
+             allow_external_links: Allow hyperlinks in output
+             enable_web_search: Whether to augment with web search
+             show_sources: Include per-field/source mapping when available
+             scrape_options: Scrape options applied prior to extraction
+             ignore_invalid_urls: Skip invalid URLs instead of failing
+
+         Returns:
+             Response payload with job id/status (poll with get_extract_status)
+         """
+         return extract_module.start_extract(
+             self.http_client,
+             urls,
+             prompt=prompt,
+             schema=schema,
+             system_prompt=system_prompt,
+             allow_external_links=allow_external_links,
+             enable_web_search=enable_web_search,
+             show_sources=show_sources,
+             scrape_options=scrape_options,
+             ignore_invalid_urls=ignore_invalid_urls,
+         )
+
+     def extract(
+         self,
+         urls: Optional[List[str]] = None,
+         *,
+         prompt: Optional[str] = None,
+         schema: Optional[Dict[str, Any]] = None,
+         system_prompt: Optional[str] = None,
+         allow_external_links: Optional[bool] = None,
+         enable_web_search: Optional[bool] = None,
+         show_sources: Optional[bool] = None,
+         scrape_options: Optional['ScrapeOptions'] = None,
+         ignore_invalid_urls: Optional[bool] = None,
+         poll_interval: int = 2,
+         timeout: Optional[int] = None,
+     ):
+         """Extract structured data and wait until completion.
+
+         Args:
+             urls: URLs to extract from (optional)
+             prompt: Natural-language instruction for extraction
+             schema: Target JSON schema for the output
+             system_prompt: Optional system instruction
+             allow_external_links: Allow hyperlinks in output
+             enable_web_search: Whether to augment with web search
+             show_sources: Include per-field/source mapping when available
+             scrape_options: Scrape options applied prior to extraction
+             ignore_invalid_urls: Skip invalid URLs instead of failing
+             poll_interval: Seconds between status checks
+             timeout: Maximum seconds to wait (None for no timeout)
+
+         Returns:
+             Final extract response when completed
+         """
+         return extract_module.extract(
+             self.http_client,
+             urls,
+             prompt=prompt,
+             schema=schema,
+             system_prompt=system_prompt,
+             allow_external_links=allow_external_links,
+             enable_web_search=enable_web_search,
+             show_sources=show_sources,
+             scrape_options=scrape_options,
+             ignore_invalid_urls=ignore_invalid_urls,
+             poll_interval=poll_interval,
+             timeout=timeout,
+         )
+
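# Illustrative sketch (not part of the diffed file): a blocking extract call with a JSON
# schema, reusing `client`. The schema is a plain JSON Schema dict, as the signature's
# Dict[str, Any] type suggests.
payload = client.extract(
    urls=["https://example.com"],
    prompt="Extract the page title and a one-sentence summary",
    schema={
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "summary": {"type": "string"},
        },
    },
    poll_interval=2,
    timeout=120,
)
print(payload)
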
+     def start_batch_scrape(
+         self,
+         urls: List[str],
+         *,
+         formats: Optional[List['FormatOption']] = None,
+         headers: Optional[Dict[str, str]] = None,
+         include_tags: Optional[List[str]] = None,
+         exclude_tags: Optional[List[str]] = None,
+         only_main_content: Optional[bool] = None,
+         timeout: Optional[int] = None,
+         wait_for: Optional[int] = None,
+         mobile: Optional[bool] = None,
+         parsers: Optional[List[str]] = None,
+         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
+         location: Optional['Location'] = None,
+         skip_tls_verification: Optional[bool] = None,
+         remove_base64_images: Optional[bool] = None,
+         fast_mode: Optional[bool] = None,
+         use_mock: Optional[str] = None,
+         block_ads: Optional[bool] = None,
+         proxy: Optional[str] = None,
+         max_age: Optional[int] = None,
+         store_in_cache: Optional[bool] = None,
+         webhook: Optional[Union[str, WebhookConfig]] = None,
+         append_to_id: Optional[str] = None,
+         ignore_invalid_urls: Optional[bool] = None,
+         max_concurrency: Optional[int] = None,
+         zero_data_retention: Optional[bool] = None,
+         integration: Optional[str] = None,
+         idempotency_key: Optional[str] = None,
+     ):
+         """Start a batch scrape job over multiple URLs (non-blocking).
+
+         Args:
+             urls: List of URLs to scrape
+             formats: Output formats to collect per URL
+             headers: HTTP headers
+             include_tags: HTML tags to include
+             exclude_tags: HTML tags to exclude
+             only_main_content: Restrict scraping to main content
+             timeout: Per-request timeout in milliseconds
+             wait_for: Wait condition in milliseconds
+             mobile: Emulate mobile viewport
+             parsers: Parser list (e.g., ["pdf"])
+             actions: Browser actions to perform
+             location: Location settings
+             skip_tls_verification: Skip TLS verification
+             remove_base64_images: Remove base64 images from output
+             fast_mode: Prefer faster scraping modes
+             use_mock: Use a mock data source (internal/testing)
+             block_ads: Block ads during scraping
+             proxy: Proxy setting
+             max_age: Cache max age
+             store_in_cache: Whether to store results in cache
+             webhook: Webhook configuration
+             append_to_id: Append to an existing batch job
+             ignore_invalid_urls: Skip invalid URLs without failing
+             max_concurrency: Max concurrent scrapes
+             zero_data_retention: Delete data after 24 hours
+             integration: Integration tag/name
+             idempotency_key: Header used to deduplicate starts
+
+         Returns:
+             Response payload with job id (poll with get_batch_scrape_status)
+         """
+         options = ScrapeOptions(
+             **{k: v for k, v in dict(
+                 formats=formats,
+                 headers=headers,
+                 include_tags=include_tags,
+                 exclude_tags=exclude_tags,
+                 only_main_content=only_main_content,
+                 timeout=timeout,
+                 wait_for=wait_for,
+                 mobile=mobile,
+                 parsers=parsers,
+                 actions=actions,
+                 location=location,
+                 skip_tls_verification=skip_tls_verification,
+                 remove_base64_images=remove_base64_images,
+                 fast_mode=fast_mode,
+                 use_mock=use_mock,
+                 block_ads=block_ads,
+                 proxy=proxy,
+                 max_age=max_age,
+                 store_in_cache=store_in_cache,
+             ).items() if v is not None}
+         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
+
+         return batch_module.start_batch_scrape(
+             self.http_client,
+             urls,
+             options=options,
+             webhook=webhook,
+             append_to_id=append_to_id,
+             ignore_invalid_urls=ignore_invalid_urls,
+             max_concurrency=max_concurrency,
+             zero_data_retention=zero_data_retention,
+             integration=integration,
+             idempotency_key=idempotency_key,
+         )
+
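# Illustrative sketch (not part of the diffed file): starting a batch job and checking it
# once, reusing `client`. The job-id field name on the response is an assumption, and the
# idempotency key is an arbitrary example value.
batch = client.start_batch_scrape(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    max_concurrency=2,
    idempotency_key="docs-batch-1",        # arbitrary example key
)
batch_id = getattr(batch, "id", None)      # assumed attribute name
if batch_id:
    print(client.get_batch_scrape_status(batch_id))
    print(client.get_batch_scrape_errors(batch_id))
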
+     def get_batch_scrape_status(self, job_id: str):
+         """Get current status and any scraped data for a batch job.
+
+         Args:
+             job_id: Batch job ID
+
+         Returns:
+             Status payload including counts and partial data
+         """
+         return batch_module.get_batch_scrape_status(self.http_client, job_id)
+
+     def cancel_batch_scrape(self, job_id: str) -> bool:
+         """Cancel a running batch scrape job.
+
+         Args:
+             job_id: Batch job ID
+
+         Returns:
+             True if the job was cancelled
+         """
+         return batch_module.cancel_batch_scrape(self.http_client, job_id)
+
+     def get_batch_scrape_errors(self, job_id: str):
+         """Retrieve error details for a batch scrape job.
+
+         Args:
+             job_id: Batch job ID
+
+         Returns:
+             Errors and robots-blocked URLs for the job
+         """
+         return batch_methods.get_batch_scrape_errors(self.http_client, job_id)
+
+     def get_extract_status(self, job_id: str):
+         """Get the current status (and data if completed) of an extract job.
+
+         Args:
+             job_id: Extract job ID
+
+         Returns:
+             Extract response payload with status and optional data
+         """
+         return extract_module.get_extract_status(self.http_client, job_id)
+
+     def get_concurrency(self):
+         """Get current concurrency and maximum allowed for this team/key (v2)."""
+         return usage_methods.get_concurrency(self.http_client)
+
+     def get_credit_usage(self):
+         """Get remaining credits for this team/key (v2)."""
+         return usage_methods.get_credit_usage(self.http_client)
+
+     def get_token_usage(self):
+         """Get recent token usage metrics (v2)."""
+         return usage_methods.get_token_usage(self.http_client)
+
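# Illustrative sketch (not part of the diffed file): account introspection helpers, reusing
# `client`. Their payload shapes are defined in the usage methods/types, so the responses
# are printed as-is.
print(client.get_concurrency())
print(client.get_credit_usage())
print(client.get_token_usage())
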
+     def watcher(
+         self,
+         job_id: str,
+         *,
+         kind: Literal["crawl", "batch"] = "crawl",
+         poll_interval: int = 2,
+         timeout: Optional[int] = None,
+     ) -> Watcher:
+         """Create a watcher for crawl or batch jobs.
+
+         Args:
+             job_id: Job ID to watch
+             kind: Job kind ("crawl" or "batch")
+             poll_interval: Seconds between status checks
+             timeout: Maximum seconds to watch (None for no timeout)
+
+         Returns:
+             Watcher instance
+         """
+         return Watcher(self, job_id, kind=kind, poll_interval=poll_interval, timeout=timeout)
+
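# Illustrative sketch (not part of the diffed file): creating a watcher for a previously
# started crawl, reusing `client`. How results are consumed (callbacks vs. iteration) is
# defined in firecrawl/v2/watcher.py, which this hunk does not show, so only construction
# is sketched; the job id is a placeholder.
w = client.watcher("some-crawl-job-id", kind="crawl", poll_interval=2, timeout=300)
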
+     def batch_scrape(
+         self,
+         urls: List[str],
+         *,
+         formats: Optional[List['FormatOption']] = None,
+         headers: Optional[Dict[str, str]] = None,
+         include_tags: Optional[List[str]] = None,
+         exclude_tags: Optional[List[str]] = None,
+         only_main_content: Optional[bool] = None,
+         timeout: Optional[int] = None,
+         wait_for: Optional[int] = None,
+         mobile: Optional[bool] = None,
+         parsers: Optional[List[str]] = None,
+         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
+         location: Optional['Location'] = None,
+         skip_tls_verification: Optional[bool] = None,
+         remove_base64_images: Optional[bool] = None,
+         fast_mode: Optional[bool] = None,
+         use_mock: Optional[str] = None,
+         block_ads: Optional[bool] = None,
+         proxy: Optional[str] = None,
+         max_age: Optional[int] = None,
+         store_in_cache: Optional[bool] = None,
+         webhook: Optional[Union[str, WebhookConfig]] = None,
+         append_to_id: Optional[str] = None,
+         ignore_invalid_urls: Optional[bool] = None,
+         max_concurrency: Optional[int] = None,
+         zero_data_retention: Optional[bool] = None,
+         integration: Optional[str] = None,
+         idempotency_key: Optional[str] = None,
+         poll_interval: int = 2,
+         wait_timeout: Optional[int] = None,
+     ):
+         """
+         Start a batch scrape job and wait until completion.
+         """
+         options = ScrapeOptions(
+             **{k: v for k, v in dict(
+                 formats=formats,
+                 headers=headers,
+                 include_tags=include_tags,
+                 exclude_tags=exclude_tags,
+                 only_main_content=only_main_content,
+                 timeout=timeout,
+                 wait_for=wait_for,
+                 mobile=mobile,
+                 parsers=parsers,
+                 actions=actions,
+                 location=location,
+                 skip_tls_verification=skip_tls_verification,
+                 remove_base64_images=remove_base64_images,
+                 fast_mode=fast_mode,
+                 use_mock=use_mock,
+                 block_ads=block_ads,
+                 proxy=proxy,
+                 max_age=max_age,
+                 store_in_cache=store_in_cache,
+             ).items() if v is not None}
+         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
+
+         return batch_module.batch_scrape(
+             self.http_client,
+             urls,
+             options=options,
+             webhook=webhook,
+             append_to_id=append_to_id,
+             ignore_invalid_urls=ignore_invalid_urls,
+             max_concurrency=max_concurrency,
+             zero_data_retention=zero_data_retention,
+             integration=integration,
+             idempotency_key=idempotency_key,
+             poll_interval=poll_interval,
+             timeout=wait_timeout,
+         )
+
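# Illustrative sketch (not part of the diffed file): the blocking batch variant, reusing
# `client`. Note the split between `timeout` (per-request scrape timeout in milliseconds,
# per the docstring above) and `wait_timeout` (assumed to be seconds to wait for the whole
# job, mirroring crawl()).
docs = client.batch_scrape(
    ["https://example.com", "https://example.org"],
    formats=["markdown"],
    poll_interval=2,
    wait_timeout=300,
)
print(docs)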