firecrawl-py 3.3.1-py3-none-any.whl → 3.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

Files changed (81)
  1. firecrawl/__init__.py +1 -1
  2. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.2.dist-info}/METADATA +1 -1
  3. firecrawl_py-3.3.2.dist-info/RECORD +79 -0
  4. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.2.dist-info}/top_level.txt +0 -2
  5. build/lib/firecrawl/__init__.py +0 -87
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -79
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -188
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -38
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -40
  10. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -137
  11. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -248
  12. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -35
  13. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -43
  14. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +0 -73
  15. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +0 -73
  16. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -105
  17. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -276
  18. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +0 -54
  19. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +0 -60
  20. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -154
  21. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +0 -269
  22. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +0 -26
  23. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -65
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -12
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -61
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -12
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -19
  28. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -50
  29. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -63
  30. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -28
  31. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -117
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -90
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -70
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -240
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -107
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -53
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -92
  38. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -167
  39. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -236
  40. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -18
  41. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -123
  42. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -290
  43. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -332
  44. build/lib/firecrawl/client.py +0 -242
  45. build/lib/firecrawl/firecrawl.backup.py +0 -4635
  46. build/lib/firecrawl/types.py +0 -161
  47. build/lib/firecrawl/v1/__init__.py +0 -14
  48. build/lib/firecrawl/v1/client.py +0 -4653
  49. build/lib/firecrawl/v2/__init__.py +0 -4
  50. build/lib/firecrawl/v2/client.py +0 -805
  51. build/lib/firecrawl/v2/client_async.py +0 -250
  52. build/lib/firecrawl/v2/methods/aio/__init__.py +0 -1
  53. build/lib/firecrawl/v2/methods/aio/batch.py +0 -85
  54. build/lib/firecrawl/v2/methods/aio/crawl.py +0 -171
  55. build/lib/firecrawl/v2/methods/aio/extract.py +0 -126
  56. build/lib/firecrawl/v2/methods/aio/map.py +0 -59
  57. build/lib/firecrawl/v2/methods/aio/scrape.py +0 -33
  58. build/lib/firecrawl/v2/methods/aio/search.py +0 -172
  59. build/lib/firecrawl/v2/methods/aio/usage.py +0 -42
  60. build/lib/firecrawl/v2/methods/batch.py +0 -417
  61. build/lib/firecrawl/v2/methods/crawl.py +0 -469
  62. build/lib/firecrawl/v2/methods/extract.py +0 -131
  63. build/lib/firecrawl/v2/methods/map.py +0 -77
  64. build/lib/firecrawl/v2/methods/scrape.py +0 -64
  65. build/lib/firecrawl/v2/methods/search.py +0 -197
  66. build/lib/firecrawl/v2/methods/usage.py +0 -41
  67. build/lib/firecrawl/v2/types.py +0 -665
  68. build/lib/firecrawl/v2/utils/__init__.py +0 -9
  69. build/lib/firecrawl/v2/utils/error_handler.py +0 -107
  70. build/lib/firecrawl/v2/utils/get_version.py +0 -15
  71. build/lib/firecrawl/v2/utils/http_client.py +0 -153
  72. build/lib/firecrawl/v2/utils/http_client_async.py +0 -65
  73. build/lib/firecrawl/v2/utils/normalize.py +0 -107
  74. build/lib/firecrawl/v2/utils/validation.py +0 -324
  75. build/lib/firecrawl/v2/watcher.py +0 -301
  76. build/lib/firecrawl/v2/watcher_async.py +0 -242
  77. build/lib/tests/test_change_tracking.py +0 -98
  78. build/lib/tests/test_timeout_conversion.py +0 -117
  79. firecrawl_py-3.3.1.dist-info/RECORD +0 -153
  80. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.2.dist-info}/LICENSE +0 -0
  81. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.2.dist-info}/WHEEL +0 -0
build/lib/firecrawl/v2/client.py
@@ -1,805 +0,0 @@
- """
- Main Firecrawl v2 API client.
-
- This module provides the main client class that orchestrates all v2 functionality.
- """
-
- import os
- from typing import Optional, List, Dict, Any, Callable, Union, Literal
- from .types import (
-     ClientConfig,
-     ScrapeOptions,
-     Document,
-     SearchRequest,
-     SearchData,
-     SourceOption,
-     CategoryOption,
-     CrawlRequest,
-     CrawlResponse,
-     CrawlJob,
-     CrawlParamsRequest,
-     CrawlParamsData,
-     WebhookConfig,
-     CrawlErrorsResponse,
-     ActiveCrawlsResponse,
-     MapOptions,
-     MapData,
-     FormatOption,
-     WaitAction,
-     ScreenshotAction,
-     ClickAction,
-     WriteAction,
-     PressAction,
-     ScrollAction,
-     ScrapeAction,
-     ExecuteJavascriptAction,
-     PDFAction,
-     Location,
- )
- from .utils.http_client import HttpClient
- from .utils.error_handler import FirecrawlError
- from .methods import scrape as scrape_module
- from .methods import crawl as crawl_module
- from .methods import batch as batch_module
- from .methods import search as search_module
- from .methods import map as map_module
- from .methods import batch as batch_methods
- from .methods import usage as usage_methods
- from .methods import extract as extract_module
- from .watcher import Watcher
-
- class FirecrawlClient:
-     """
-     Main Firecrawl v2 API client.
-
-     This client provides a clean, modular interface to all Firecrawl functionality.
-     """
-
-     def __init__(
-         self,
-         api_key: Optional[str] = None,
-         api_url: str = "https://api.firecrawl.dev",
-         timeout: Optional[float] = None,
-         max_retries: int = 3,
-         backoff_factor: float = 0.5
-     ):
-         """
-         Initialize the Firecrawl client.
-
-         Args:
-             api_key: Firecrawl API key (or set FIRECRAWL_API_KEY env var)
-             api_url: Base URL for the Firecrawl API
-             timeout: Request timeout in seconds
-             max_retries: Maximum number of retries for failed requests
-             backoff_factor: Exponential backoff factor for retries (e.g. 0.5 means wait 0.5s, then 1s, then 2s between retries)
-         """
-         if api_key is None:
-             api_key = os.getenv("FIRECRAWL_API_KEY")
-
-         if not api_key:
-             raise ValueError(
-                 "API key is required. Set FIRECRAWL_API_KEY environment variable "
-                 "or pass api_key parameter."
-             )
-
-         self.config = ClientConfig(
-             api_key=api_key,
-             api_url=api_url,
-             timeout=timeout,
-             max_retries=max_retries,
-             backoff_factor=backoff_factor
-         )
-
-         self.http_client = HttpClient(api_key, api_url)
-
-     def scrape(
-         self,
-         url: str,
-         *,
-         formats: Optional[List['FormatOption']] = None,
-         headers: Optional[Dict[str, str]] = None,
-         include_tags: Optional[List[str]] = None,
-         exclude_tags: Optional[List[str]] = None,
-         only_main_content: Optional[bool] = None,
-         timeout: Optional[int] = None,
-         wait_for: Optional[int] = None,
-         mobile: Optional[bool] = None,
-         parsers: Optional[List[str]] = None,
-         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
-         location: Optional['Location'] = None,
-         skip_tls_verification: Optional[bool] = None,
-         remove_base64_images: Optional[bool] = None,
-         fast_mode: Optional[bool] = None,
-         use_mock: Optional[str] = None,
-         block_ads: Optional[bool] = None,
-         proxy: Optional[str] = None,
-         max_age: Optional[int] = None,
-         store_in_cache: Optional[bool] = None,
-     ) -> Document:
-         """
-         Scrape a single URL and return the document.
-         Args:
-             url: URL to scrape
-             formats: List of formats to scrape
-             headers: Dictionary of headers to use
-             include_tags: List of tags to include
-             exclude_tags: List of tags to exclude
-             only_main_content: Whether to only scrape the main content
-             timeout: Timeout in seconds
-             wait_for: Wait for a specific element to be present
-             mobile: Whether to use mobile mode
-             parsers: List of parsers to use
-             actions: List of actions to perform
-             location: Location to scrape
-             skip_tls_verification: Whether to skip TLS verification
-             remove_base64_images: Whether to remove base64 images
-             fast_mode: Whether to use fast mode
-             use_mock: Whether to use mock mode
-             block_ads: Whether to block ads
-             proxy: Proxy to use
-             max_age: Maximum age of the cache
-             store_in_cache: Whether to store the result in the cache
-         Returns:
-             Document
-         """
-         options = ScrapeOptions(
-             **{k: v for k, v in dict(
-                 formats=formats,
-                 headers=headers,
-                 include_tags=include_tags,
-                 exclude_tags=exclude_tags,
-                 only_main_content=only_main_content,
-                 timeout=timeout,
-                 wait_for=wait_for,
-                 mobile=mobile,
-                 parsers=parsers,
-                 actions=actions,
-                 location=location,
-                 skip_tls_verification=skip_tls_verification,
-                 remove_base64_images=remove_base64_images,
-                 fast_mode=fast_mode,
-                 use_mock=use_mock,
-                 block_ads=block_ads,
-                 proxy=proxy,
-                 max_age=max_age,
-                 store_in_cache=store_in_cache,
-             ).items() if v is not None}
-         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
-         return scrape_module.scrape(self.http_client, url, options)
-
-     def search(
-         self,
-         query: str,
-         *,
-         sources: Optional[List[SourceOption]] = None,
-         categories: Optional[List[CategoryOption]] = None,
-         limit: Optional[int] = None,
-         tbs: Optional[str] = None,
-         location: Optional[str] = None,
-         ignore_invalid_urls: Optional[bool] = None,
-         timeout: Optional[int] = None,
-         scrape_options: Optional[ScrapeOptions] = None,
-     ) -> SearchData:
-         """
-         Search for documents.
-
-         Args:
-             query: Search query string
-             limit: Maximum number of results to return (default: 5)
-             tbs: Time-based search filter
-             location: Location string for search
-             timeout: Request timeout in milliseconds (default: 60000)
-             page_options: Options for scraping individual pages
-
-         Returns:
-             SearchData containing the search results
-         """
-         request = SearchRequest(
-             query=query,
-             sources=sources,
-             categories=categories,
-             limit=limit,
-             tbs=tbs,
-             location=location,
-             ignore_invalid_urls=ignore_invalid_urls,
-             timeout=timeout,
-             scrape_options=scrape_options,
-         )
-
-         return search_module.search(self.http_client, request)
-
-     def crawl(
-         self,
-         url: str,
-         *,
-         prompt: Optional[str] = None,
-         exclude_paths: Optional[List[str]] = None,
-         include_paths: Optional[List[str]] = None,
-         max_discovery_depth: Optional[int] = None,
-         ignore_sitemap: bool = False,
-         ignore_query_parameters: bool = False,
-         limit: Optional[int] = None,
-         crawl_entire_domain: bool = False,
-         allow_external_links: bool = False,
-         allow_subdomains: bool = False,
-         delay: Optional[int] = None,
-         max_concurrency: Optional[int] = None,
-         webhook: Optional[Union[str, WebhookConfig]] = None,
-         scrape_options: Optional[ScrapeOptions] = None,
-         zero_data_retention: bool = False,
-         poll_interval: int = 2,
-         timeout: Optional[int] = None
-     ) -> CrawlJob:
-         """
-         Start a crawl job and wait for it to complete.
-
-         Args:
-             url: Target URL to start crawling from
-             prompt: Optional prompt to guide the crawl
-             exclude_paths: Patterns of URLs to exclude
-             include_paths: Patterns of URLs to include
-             max_discovery_depth: Maximum depth for finding new URLs
-             ignore_sitemap: Skip sitemap.xml processing
-             ignore_query_parameters: Ignore URL parameters
-             limit: Maximum pages to crawl
-             crawl_entire_domain: Follow parent directory links
-             allow_external_links: Follow external domain links
-             allow_subdomains: Follow subdomains
-             delay: Delay in seconds between scrapes
-             max_concurrency: Maximum number of concurrent scrapes
-             webhook: Webhook configuration for notifications
-             scrape_options: Page scraping configuration
-             zero_data_retention: Whether to delete data after 24 hours
-             poll_interval: Seconds between status checks
-             timeout: Maximum seconds to wait (None for no timeout)
-
-         Returns:
-             CrawlJob when job completes
-
-         Raises:
-             ValueError: If request is invalid
-             Exception: If the crawl fails to start or complete
-             TimeoutError: If timeout is reached
-         """
-         request = CrawlRequest(
-             url=url,
-             prompt=prompt,
-             exclude_paths=exclude_paths,
-             include_paths=include_paths,
-             max_discovery_depth=max_discovery_depth,
-             ignore_sitemap=ignore_sitemap,
-             ignore_query_parameters=ignore_query_parameters,
-             limit=limit,
-             crawl_entire_domain=crawl_entire_domain,
-             allow_external_links=allow_external_links,
-             allow_subdomains=allow_subdomains,
-             delay=delay,
-             max_concurrency=max_concurrency,
-             webhook=webhook,
-             scrape_options=scrape_options,
-             zero_data_retention=zero_data_retention
-         )
-
-         return crawl_module.crawl(
-             self.http_client,
-             request,
-             poll_interval=poll_interval,
-             timeout=timeout
-         )
-
-     def start_crawl(
-         self,
-         url: str,
-         *,
-         prompt: Optional[str] = None,
-         exclude_paths: Optional[List[str]] = None,
-         include_paths: Optional[List[str]] = None,
-         max_discovery_depth: Optional[int] = None,
-         ignore_sitemap: bool = False,
-         ignore_query_parameters: bool = False,
-         limit: Optional[int] = None,
-         crawl_entire_domain: bool = False,
-         allow_external_links: bool = False,
-         allow_subdomains: bool = False,
-         delay: Optional[int] = None,
-         max_concurrency: Optional[int] = None,
-         webhook: Optional[Union[str, WebhookConfig]] = None,
-         scrape_options: Optional[ScrapeOptions] = None,
-         zero_data_retention: bool = False
-     ) -> CrawlResponse:
-         """
-         Start an asynchronous crawl job.
-
-         Args:
-             url: Target URL to start crawling from
-             prompt: Optional prompt to guide the crawl
-             exclude_paths: Patterns of URLs to exclude
-             include_paths: Patterns of URLs to include
-             max_discovery_depth: Maximum depth for finding new URLs
-             ignore_sitemap: Skip sitemap.xml processing
-             ignore_query_parameters: Ignore URL parameters
-             limit: Maximum pages to crawl
-             crawl_entire_domain: Follow parent directory links
-             allow_external_links: Follow external domain links
-             allow_subdomains: Follow subdomains
-             delay: Delay in seconds between scrapes
-             max_concurrency: Maximum number of concurrent scrapes
-             webhook: Webhook configuration for notifications
-             scrape_options: Page scraping configuration
-             zero_data_retention: Whether to delete data after 24 hours
-
-         Returns:
-             CrawlResponse with job information
-
-         Raises:
-             ValueError: If request is invalid
-             Exception: If the crawl operation fails to start
-         """
-         request = CrawlRequest(
-             url=url,
-             prompt=prompt,
-             exclude_paths=exclude_paths,
-             include_paths=include_paths,
-             max_discovery_depth=max_discovery_depth,
-             ignore_sitemap=ignore_sitemap,
-             ignore_query_parameters=ignore_query_parameters,
-             limit=limit,
-             crawl_entire_domain=crawl_entire_domain,
-             allow_external_links=allow_external_links,
-             allow_subdomains=allow_subdomains,
-             delay=delay,
-             max_concurrency=max_concurrency,
-             webhook=webhook,
-             scrape_options=scrape_options,
-             zero_data_retention=zero_data_retention
-         )
-
-         return crawl_module.start_crawl(self.http_client, request)
-
-     def get_crawl_status(self, job_id: str) -> CrawlJob:
-         """
-         Get the status of a crawl job.
-
-         Args:
-             job_id: ID of the crawl job
-
-         Returns:
-             CrawlJob with current status and data
-
-         Raises:
-             Exception: If the status check fails
-         """
-         return crawl_module.get_crawl_status(self.http_client, job_id)
-
-     def get_crawl_errors(self, crawl_id: str) -> CrawlErrorsResponse:
-         """
-         Retrieve error details and robots.txt blocks for a given crawl job.
-
-         Args:
-             crawl_id: The ID of the crawl job
-
-         Returns:
-             CrawlErrorsResponse containing per-URL errors and robots-blocked URLs
-         """
-         return crawl_module.get_crawl_errors(self.http_client, crawl_id)
-
-     def get_active_crawls(self) -> ActiveCrawlsResponse:
-         """
-         Get a list of currently active crawl jobs.
-
-         Returns:
-             ActiveCrawlsResponse containing a list of active crawl jobs.
-         """
-         return crawl_module.get_active_crawls(self.http_client)
-
-     def active_crawls(self) -> ActiveCrawlsResponse:
-         """
-         List currently active crawl jobs for the authenticated team.
-
-         Returns:
-             ActiveCrawlsResponse containing the list of active crawl jobs
-         """
-         return self.get_active_crawls()
-
-     def map(
-         self,
-         url: str,
-         *,
-         search: Optional[str] = None,
-         include_subdomains: Optional[bool] = None,
-         limit: Optional[int] = None,
-         sitemap: Optional[Literal["only", "include", "skip"]] = None,
-         timeout: Optional[int] = None,
-     ) -> MapData:
-         """Map a URL and return discovered links.
-
-         Args:
-             url: Root URL to explore
-             search: Optional substring filter for discovered links
-             include_subdomains: Whether to include subdomains
-             limit: Maximum number of links to return
-             sitemap: Sitemap usage mode ("only" | "include" | "skip")
-             timeout: Request timeout in milliseconds
-
-         Returns:
-             MapData containing the discovered links
-         """
-         options = MapOptions(
-             search=search,
-             include_subdomains=include_subdomains,
-             limit=limit,
-             sitemap=sitemap if sitemap is not None else "include",
-             timeout=timeout,
-         ) if any(v is not None for v in [search, include_subdomains, limit, sitemap, timeout]) else None
-
-         return map_module.map(self.http_client, url, options)
-
-     def cancel_crawl(self, crawl_id: str) -> bool:
-         """
-         Cancel a crawl job.
-
-         Args:
-             crawl_id: The ID of the crawl job to cancel
-
-         Returns:
-             bool: True if the crawl was cancelled, False otherwise
-         """
-         return crawl_module.cancel_crawl(self.http_client, crawl_id)
-
-     def crawl_params_preview(self, url: str, prompt: str) -> CrawlParamsData:
-         """Derive crawl parameters from natural-language prompt.
-
-         Args:
-             url: Root URL
-             prompt: Instruction describing how to crawl
-
-         Returns:
-             CrawlParamsData with normalized crawl configuration
-         """
-         request = CrawlParamsRequest(url=url, prompt=prompt)
-         return crawl_module.crawl_params_preview(self.http_client, request)
-
-     def start_extract(
-         self,
-         urls: Optional[List[str]] = None,
-         *,
-         prompt: Optional[str] = None,
-         schema: Optional[Dict[str, Any]] = None,
-         system_prompt: Optional[str] = None,
-         allow_external_links: Optional[bool] = None,
-         enable_web_search: Optional[bool] = None,
-         show_sources: Optional[bool] = None,
-         scrape_options: Optional['ScrapeOptions'] = None,
-         ignore_invalid_urls: Optional[bool] = None,
-     ):
-         """Start an extract job (non-blocking).
-
-         Args:
-             urls: URLs to extract from (optional)
-             prompt: Natural-language instruction for extraction
-             schema: Target JSON schema for the output
-             system_prompt: Optional system instruction
-             allow_external_links: Allow hyperlinks in output
-             enable_web_search: Whether to augment with web search
-             show_sources: Include per-field/source mapping when available
-             scrape_options: Scrape options applied prior to extraction
-             ignore_invalid_urls: Skip invalid URLs instead of failing
-
-         Returns:
-             Response payload with job id/status (poll with get_extract_status)
-         """
-         return extract_module.start_extract(
-             self.http_client,
-             urls,
-             prompt=prompt,
-             schema=schema,
-             system_prompt=system_prompt,
-             allow_external_links=allow_external_links,
-             enable_web_search=enable_web_search,
-             show_sources=show_sources,
-             scrape_options=scrape_options,
-             ignore_invalid_urls=ignore_invalid_urls,
-         )
-
-     def extract(
-         self,
-         urls: Optional[List[str]] = None,
-         *,
-         prompt: Optional[str] = None,
-         schema: Optional[Dict[str, Any]] = None,
-         system_prompt: Optional[str] = None,
-         allow_external_links: Optional[bool] = None,
-         enable_web_search: Optional[bool] = None,
-         show_sources: Optional[bool] = None,
-         scrape_options: Optional['ScrapeOptions'] = None,
-         ignore_invalid_urls: Optional[bool] = None,
-         poll_interval: int = 2,
-         timeout: Optional[int] = None,
-     ):
-         """Extract structured data and wait until completion.
-
-         Args:
-             urls: URLs to extract from (optional)
-             prompt: Natural-language instruction for extraction
-             schema: Target JSON schema for the output
-             system_prompt: Optional system instruction
-             allow_external_links: Allow hyperlinks in output
-             enable_web_search: Whether to augment with web search
-             show_sources: Include per-field/source mapping when available
-             scrape_options: Scrape options applied prior to extraction
-             ignore_invalid_urls: Skip invalid URLs instead of failing
-             poll_interval: Seconds between status checks
-             timeout: Maximum seconds to wait (None for no timeout)
-
-         Returns:
-             Final extract response when completed
-         """
-         return extract_module.extract(
-             self.http_client,
-             urls,
-             prompt=prompt,
-             schema=schema,
-             system_prompt=system_prompt,
-             allow_external_links=allow_external_links,
-             enable_web_search=enable_web_search,
-             show_sources=show_sources,
-             scrape_options=scrape_options,
-             ignore_invalid_urls=ignore_invalid_urls,
-             poll_interval=poll_interval,
-             timeout=timeout,
-         )
-
-     def start_batch_scrape(
-         self,
-         urls: List[str],
-         *,
-         formats: Optional[List['FormatOption']] = None,
-         headers: Optional[Dict[str, str]] = None,
-         include_tags: Optional[List[str]] = None,
-         exclude_tags: Optional[List[str]] = None,
-         only_main_content: Optional[bool] = None,
-         timeout: Optional[int] = None,
-         wait_for: Optional[int] = None,
-         mobile: Optional[bool] = None,
-         parsers: Optional[List[str]] = None,
-         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
-         location: Optional['Location'] = None,
-         skip_tls_verification: Optional[bool] = None,
-         remove_base64_images: Optional[bool] = None,
-         fast_mode: Optional[bool] = None,
-         use_mock: Optional[str] = None,
-         block_ads: Optional[bool] = None,
-         proxy: Optional[str] = None,
-         max_age: Optional[int] = None,
-         store_in_cache: Optional[bool] = None,
-         webhook: Optional[Union[str, WebhookConfig]] = None,
-         append_to_id: Optional[str] = None,
-         ignore_invalid_urls: Optional[bool] = None,
-         max_concurrency: Optional[int] = None,
-         zero_data_retention: Optional[bool] = None,
-         integration: Optional[str] = None,
-         idempotency_key: Optional[str] = None,
-     ):
-         """Start a batch scrape job over multiple URLs (non-blocking).
-
-         Args:
-             urls: List of URLs to scrape
-             formats: Output formats to collect per URL
-             headers: HTTP headers
-             include_tags: HTML tags to include
-             exclude_tags: HTML tags to exclude
-             only_main_content: Restrict scraping to main content
-             timeout: Per-request timeout in milliseconds
-             wait_for: Wait condition in milliseconds
-             mobile: Emulate mobile viewport
-             parsers: Parser list (e.g., ["pdf"])
-             actions: Browser actions to perform
-             location: Location settings
-             skip_tls_verification: Skip TLS verification
-             remove_base64_images: Remove base64 images from output
-             fast_mode: Prefer faster scraping modes
-             use_mock: Use a mock data source (internal/testing)
-             block_ads: Block ads during scraping
-             proxy: Proxy setting
-             max_age: Cache max age
-             store_in_cache: Whether to store results in cache
-             webhook: Webhook configuration
-             append_to_id: Append to an existing batch job
-             ignore_invalid_urls: Skip invalid URLs without failing
-             max_concurrency: Max concurrent scrapes
-             zero_data_retention: Delete data after 24 hours
-             integration: Integration tag/name
-             idempotency_key: Header used to deduplicate starts
-
-         Returns:
-             Response payload with job id (poll with get_batch_scrape_status)
-         """
-         options = ScrapeOptions(
-             **{k: v for k, v in dict(
-                 formats=formats,
-                 headers=headers,
-                 include_tags=include_tags,
-                 exclude_tags=exclude_tags,
-                 only_main_content=only_main_content,
-                 timeout=timeout,
-                 wait_for=wait_for,
-                 mobile=mobile,
-                 parsers=parsers,
-                 actions=actions,
-                 location=location,
-                 skip_tls_verification=skip_tls_verification,
-                 remove_base64_images=remove_base64_images,
-                 fast_mode=fast_mode,
-                 use_mock=use_mock,
-                 block_ads=block_ads,
-                 proxy=proxy,
-                 max_age=max_age,
-                 store_in_cache=store_in_cache,
-             ).items() if v is not None}
-         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
-
-         return batch_module.start_batch_scrape(
-             self.http_client,
-             urls,
-             options=options,
-             webhook=webhook,
-             append_to_id=append_to_id,
-             ignore_invalid_urls=ignore_invalid_urls,
-             max_concurrency=max_concurrency,
-             zero_data_retention=zero_data_retention,
-             integration=integration,
-             idempotency_key=idempotency_key,
-         )
-
-     def get_batch_scrape_status(self, job_id: str):
-         """Get current status and any scraped data for a batch job.
-
-         Args:
-             job_id: Batch job ID
-
-         Returns:
-             Status payload including counts and partial data
-         """
-         return batch_module.get_batch_scrape_status(self.http_client, job_id)
-
-     def cancel_batch_scrape(self, job_id: str) -> bool:
-         """Cancel a running batch scrape job.
-
-         Args:
-             job_id: Batch job ID
-
-         Returns:
-             True if the job was cancelled
-         """
-         return batch_module.cancel_batch_scrape(self.http_client, job_id)
-
-     def get_batch_scrape_errors(self, job_id: str):
-         """Retrieve error details for a batch scrape job.
-
-         Args:
-             job_id: Batch job ID
-
-         Returns:
-             Errors and robots-blocked URLs for the job
-         """
-         return batch_methods.get_batch_scrape_errors(self.http_client, job_id)
-
-     def get_extract_status(self, job_id: str):
-         """Get the current status (and data if completed) of an extract job.
-
-         Args:
-             job_id: Extract job ID
-
-         Returns:
-             Extract response payload with status and optional data
-         """
-         return extract_module.get_extract_status(self.http_client, job_id)
-
-     def get_concurrency(self):
-         """Get current concurrency and maximum allowed for this team/key (v2)."""
-         return usage_methods.get_concurrency(self.http_client)
-
-     def get_credit_usage(self):
-         """Get remaining credits for this team/key (v2)."""
-         return usage_methods.get_credit_usage(self.http_client)
-
-     def get_token_usage(self):
-         """Get recent token usage metrics (v2)."""
-         return usage_methods.get_token_usage(self.http_client)
-
-     def watcher(
-         self,
-         job_id: str,
-         *,
-         kind: Literal["crawl", "batch"] = "crawl",
-         poll_interval: int = 2,
-         timeout: Optional[int] = None,
-     ) -> Watcher:
-         """Create a watcher for crawl or batch jobs.
-
-         Args:
-             job_id: Job ID to watch
-             kind: Job kind ("crawl" or "batch")
-             poll_interval: Seconds between status checks
-             timeout: Maximum seconds to watch (None for no timeout)
-
-         Returns:
-             Watcher instance
-         """
-         return Watcher(self, job_id, kind=kind, poll_interval=poll_interval, timeout=timeout)
-
-     def batch_scrape(
-         self,
-         urls: List[str],
-         *,
-         formats: Optional[List['FormatOption']] = None,
-         headers: Optional[Dict[str, str]] = None,
-         include_tags: Optional[List[str]] = None,
-         exclude_tags: Optional[List[str]] = None,
-         only_main_content: Optional[bool] = None,
-         timeout: Optional[int] = None,
-         wait_for: Optional[int] = None,
-         mobile: Optional[bool] = None,
-         parsers: Optional[List[str]] = None,
-         actions: Optional[List[Union['WaitAction', 'ScreenshotAction', 'ClickAction', 'WriteAction', 'PressAction', 'ScrollAction', 'ScrapeAction', 'ExecuteJavascriptAction', 'PDFAction']]] = None,
-         location: Optional['Location'] = None,
-         skip_tls_verification: Optional[bool] = None,
-         remove_base64_images: Optional[bool] = None,
-         fast_mode: Optional[bool] = None,
-         use_mock: Optional[str] = None,
-         block_ads: Optional[bool] = None,
-         proxy: Optional[str] = None,
-         max_age: Optional[int] = None,
-         store_in_cache: Optional[bool] = None,
-         webhook: Optional[Union[str, WebhookConfig]] = None,
-         append_to_id: Optional[str] = None,
-         ignore_invalid_urls: Optional[bool] = None,
-         max_concurrency: Optional[int] = None,
-         zero_data_retention: Optional[bool] = None,
-         integration: Optional[str] = None,
-         idempotency_key: Optional[str] = None,
-         poll_interval: int = 2,
-         wait_timeout: Optional[int] = None,
-     ):
-         """
-         Start a batch scrape job and wait until completion.
-         """
-         options = ScrapeOptions(
-             **{k: v for k, v in dict(
-                 formats=formats,
-                 headers=headers,
-                 include_tags=include_tags,
-                 exclude_tags=exclude_tags,
-                 only_main_content=only_main_content,
-                 timeout=timeout,
-                 wait_for=wait_for,
-                 mobile=mobile,
-                 parsers=parsers,
-                 actions=actions,
-                 location=location,
-                 skip_tls_verification=skip_tls_verification,
-                 remove_base64_images=remove_base64_images,
-                 fast_mode=fast_mode,
-                 use_mock=use_mock,
-                 block_ads=block_ads,
-                 proxy=proxy,
-                 max_age=max_age,
-                 store_in_cache=store_in_cache,
-             ).items() if v is not None}
-         ) if any(v is not None for v in [formats, headers, include_tags, exclude_tags, only_main_content, timeout, wait_for, mobile, parsers, actions, location, skip_tls_verification, remove_base64_images, fast_mode, use_mock, block_ads, proxy, max_age, store_in_cache]) else None
-
-         return batch_module.batch_scrape(
-             self.http_client,
-             urls,
-             options=options,
-             webhook=webhook,
-             append_to_id=append_to_id,
-             ignore_invalid_urls=ignore_invalid_urls,
-             max_concurrency=max_concurrency,
-             zero_data_retention=zero_data_retention,
-             integration=integration,
-             idempotency_key=idempotency_key,
-             poll_interval=poll_interval,
-             timeout=wait_timeout,
-         )
-
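
For context, the removed file above is the accidentally bundled build/lib copy of the v2 client; the sketch below is a minimal illustration of how a client with these signatures is used, based only on the method definitions visible in this diff. The import path assumes the packaged firecrawl/v2/client.py (which is not part of this diff) mirrors the build/lib copy; the URL, format value, and limits are placeholder examples, not taken from the package.

import os

from firecrawl.v2.client import FirecrawlClient

# Per the __init__ shown above, the key can be passed explicitly or picked up
# from the FIRECRAWL_API_KEY environment variable.
client = FirecrawlClient(api_key=os.getenv("FIRECRAWL_API_KEY"))

# Scrape a single URL; every keyword option is optional in the signature.
# "markdown" is an assumed example format value.
doc = client.scrape("https://example.com", formats=["markdown"], only_main_content=True)

# Blocking crawl: starts the job and polls until completion (poll_interval
# seconds between checks, timeout in seconds), returning a CrawlJob.
job = client.crawl("https://example.com", limit=5, poll_interval=2, timeout=120)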