firecrawl-4.12.0-py3-none-any.whl

This diff shows the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/aio/batch.py
@@ -0,0 +1,188 @@
+ from typing import Optional, List, Dict, Any
+ from ...types import ScrapeOptions, WebhookConfig, Document, BatchScrapeResponse, BatchScrapeJob, PaginationConfig
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.validation import prepare_scrape_options
+ from ...utils.error_handler import handle_response_error
+ from ...utils.normalize import normalize_document_input
+ from ...methods.batch import validate_batch_urls
+ import time
+
+ def _prepare(urls: List[str], *, options: Optional[ScrapeOptions] = None, **kwargs) -> Dict[str, Any]:
+     if not urls:
+         raise ValueError("URLs list cannot be empty")
+
+     validated_urls = validate_batch_urls([u.strip() if isinstance(u, str) else u for u in urls])
+     payload: Dict[str, Any] = {"urls": validated_urls}
+     if options:
+         opts = prepare_scrape_options(options)
+         if opts:
+             payload.update(opts)
+     if (w := kwargs.get("webhook")) is not None:
+         payload["webhook"] = w if isinstance(w, str) else w.model_dump(exclude_none=True)
+     if (v := kwargs.get("append_to_id")) is not None:
+         payload["appendToId"] = v
+     if (v := kwargs.get("ignore_invalid_urls")) is not None:
+         payload["ignoreInvalidURLs"] = v
+     if (v := kwargs.get("max_concurrency")) is not None:
+         payload["maxConcurrency"] = v
+     if (v := kwargs.get("zero_data_retention")) is not None:
+         payload["zeroDataRetention"] = v
+     if (v := kwargs.get("integration")) is not None:
+         trimmed_integration = str(v).strip()
+         if trimmed_integration:
+             payload["integration"] = trimmed_integration
+     return payload
+
+
+ async def start_batch_scrape(client: AsyncHttpClient, urls: List[str], **kwargs) -> BatchScrapeResponse:
+     payload = _prepare(urls, **kwargs)
+     response = await client.post("/v2/batch/scrape", payload)
+     if response.status_code >= 400:
+         handle_response_error(response, "start batch scrape")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     return BatchScrapeResponse(id=body.get("id"), url=body.get("url"), invalid_urls=body.get("invalidURLs"))
+
+
+ async def get_batch_scrape_status(
+     client: AsyncHttpClient,
+     job_id: str,
+     pagination_config: Optional[PaginationConfig] = None
+ ) -> BatchScrapeJob:
+     """
+     Get the status of a batch scrape job.
+
+     Args:
+         client: Async HTTP client instance
+         job_id: ID of the batch scrape job
+         pagination_config: Optional configuration for pagination behavior
+
+     Returns:
+         BatchScrapeJob containing job status and data
+
+     Raises:
+         Exception: If the status check fails
+     """
+     response = await client.get(f"/v2/batch/scrape/{job_id}")
+     if response.status_code >= 400:
+         handle_response_error(response, "get batch scrape status")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     docs: List[Document] = []
+     for doc in body.get("data", []) or []:
+         if isinstance(doc, dict):
+             normalized = normalize_document_input(doc)
+             docs.append(Document(**normalized))
+
+     # Handle pagination if requested
+     auto_paginate = pagination_config.auto_paginate if pagination_config else True
+     if auto_paginate and body.get("next"):
+         docs = await _fetch_all_batch_pages_async(
+             client,
+             body.get("next"),
+             docs,
+             pagination_config
+         )
+
+     return BatchScrapeJob(
+         status=body.get("status"),
+         completed=body.get("completed", 0),
+         total=body.get("total", 0),
+         credits_used=body.get("creditsUsed"),
+         expires_at=body.get("expiresAt"),
+         next=body.get("next") if not auto_paginate else None,
+         data=docs,
+     )
+
+
+ async def _fetch_all_batch_pages_async(
+     client: AsyncHttpClient,
+     next_url: str,
+     initial_documents: List[Document],
+     pagination_config: Optional[PaginationConfig] = None
+ ) -> List[Document]:
+     """
+     Fetch all pages of batch scrape results asynchronously.
+
+     Args:
+         client: Async HTTP client instance
+         next_url: URL for the next page
+         initial_documents: Documents from the first page
+         pagination_config: Optional configuration for pagination limits
+
+     Returns:
+         List of all documents from all pages
+     """
+     documents = initial_documents.copy()
+     current_url = next_url
+     page_count = 0
+
+     # Apply pagination limits
+     max_pages = pagination_config.max_pages if pagination_config else None
+     max_results = pagination_config.max_results if pagination_config else None
+     max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+     start_time = time.monotonic()
+
+     while current_url:
+         # Check pagination limits
+         if (max_pages is not None) and (page_count >= max_pages):
+             break
+
+         if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+             break
+
+         # Fetch next page
+         response = await client.get(current_url)
+
+         if response.status_code >= 400:
+             # Log error but continue with what we have
+             import logging
+             logger = logging.getLogger("firecrawl")
+             logger.warning(f"Failed to fetch next page: {response.status_code}")
+             break
+
+         page_data = response.json()
+
+         if not page_data.get("success"):
+             break
+
+         # Add documents from this page
+         for doc in page_data.get("data", []) or []:
+             if isinstance(doc, dict):
+                 # Check max_results limit
+                 if (max_results is not None) and (len(documents) >= max_results):
+                     break
+                 normalized = normalize_document_input(doc)
+                 documents.append(Document(**normalized))
+
+         # Check if we hit max_results limit
+         if (max_results is not None) and (len(documents) >= max_results):
+             break
+
+         # Get next URL
+         current_url = page_data.get("next")
+         page_count += 1
+
+     return documents
+
+
+ async def cancel_batch_scrape(client: AsyncHttpClient, job_id: str) -> bool:
+     response = await client.delete(f"/v2/batch/scrape/{job_id}")
+     if response.status_code >= 400:
+         handle_response_error(response, "cancel batch scrape")
+     body = response.json()
+     return body.get("status") == "cancelled"
+
+
+ async def get_batch_scrape_errors(client: AsyncHttpClient, job_id: str) -> Dict[str, Any]:
+     response = await client.get(f"/v2/batch/scrape/{job_id}/errors")
+     if response.status_code >= 400:
+         handle_response_error(response, "get batch scrape errors")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     return body
+
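For orientation, here is a minimal usage sketch of the async batch helpers above. It assumes an AsyncHttpClient constructed with an API key and API URL (the constructor signature is not part of this hunk), and uses only the PaginationConfig, BatchScrapeResponse, and BatchScrapeJob fields referenced in the code:

import asyncio

from firecrawl.v2.methods.aio import batch
from firecrawl.v2.types import PaginationConfig
from firecrawl.v2.utils.http_client_async import AsyncHttpClient


async def main() -> None:
    # Assumed constructor arguments; AsyncHttpClient is defined in
    # firecrawl/v2/utils/http_client_async.py, which is not shown in this hunk.
    client = AsyncHttpClient(api_key="fc-...", api_url="https://api.firecrawl.dev")

    started = await batch.start_batch_scrape(
        client,
        ["https://example.com", "https://docs.firecrawl.dev"],
        max_concurrency=2,
    )

    # Auto-pagination is on by default; cap it at two extra pages here.
    job = await batch.get_batch_scrape_status(
        client,
        started.id,
        pagination_config=PaginationConfig(max_pages=2),
    )
    print(job.status, job.completed, job.total, len(job.data))


asyncio.run(main())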
firecrawl/v2/methods/aio/crawl.py
@@ -0,0 +1,351 @@
+ from typing import Optional, Dict, Any, List
+ from ...types import (
+     CrawlRequest,
+     CrawlJob,
+     CrawlResponse,
+     Document,
+     CrawlParamsRequest,
+     CrawlParamsData,
+     WebhookConfig,
+     CrawlErrorsResponse,
+     ActiveCrawlsResponse,
+     ActiveCrawl,
+     PaginationConfig,
+ )
+ from ...utils.error_handler import handle_response_error
+ from ...utils.validation import prepare_scrape_options
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.normalize import normalize_document_input
+ import time
+
+
+ def _prepare_crawl_request(request: CrawlRequest) -> dict:
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+     data = {"url": request.url}
+     if request.prompt:
+         data["prompt"] = request.prompt
+     if request.scrape_options is not None:
+         opts = prepare_scrape_options(request.scrape_options)
+         if opts:
+             data["scrapeOptions"] = opts
+     # Webhook conversion
+     if request.webhook is not None:
+         if isinstance(request.webhook, str):
+             data["webhook"] = request.webhook
+         else:
+             data["webhook"] = request.webhook.model_dump(exclude_none=True)
+     request_data = request.model_dump(exclude_none=True, exclude_unset=True)
+     request_data.pop("url", None)
+     request_data.pop("prompt", None)
+     request_data.pop("scrape_options", None)
+     field_mappings = {
+         "include_paths": "includePaths",
+         "exclude_paths": "excludePaths",
+         "max_discovery_depth": "maxDiscoveryDepth",
+         "ignore_sitemap": "ignoreSitemap",
+         "ignore_query_parameters": "ignoreQueryParameters",
+         "crawl_entire_domain": "crawlEntireDomain",
+         "allow_external_links": "allowExternalLinks",
+         "allow_subdomains": "allowSubdomains",
+         "delay": "delay",
+         "max_concurrency": "maxConcurrency",
+         "zero_data_retention": "zeroDataRetention",
+     }
+     for snake, camel in field_mappings.items():
+         if snake in request_data:
+             data[camel] = request_data.pop(snake)
+     data.update(request_data)
+     if getattr(request, "integration", None) is not None:
+         data["integration"] = str(getattr(request, "integration")).strip()
+     return data
+
+
+ async def start_crawl(client: AsyncHttpClient, request: CrawlRequest) -> CrawlResponse:
+     """
+     Start a crawl job for a website.
+
+     Args:
+         client: Async HTTP client instance
+         request: CrawlRequest containing URL and options
+
+     Returns:
+         CrawlResponse with job information
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the crawl operation fails to start
+     """
+     payload = _prepare_crawl_request(request)
+     response = await client.post("/v2/crawl", payload)
+     if response.status_code >= 400:
+         handle_response_error(response, "start crawl")
+     body = response.json()
+     if body.get("success"):
+         return CrawlResponse(id=body.get("id"), url=body.get("url"))
+     raise Exception(body.get("error", "Unknown error occurred"))
+
+
+ async def get_crawl_status(
+     client: AsyncHttpClient,
+     job_id: str,
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> CrawlJob:
+     """
+     Get the status of a crawl job.
+
+     Args:
+         client: Async HTTP client instance
+         job_id: ID of the crawl job
+         pagination_config: Optional configuration for pagination limits
+         request_timeout: Timeout (in seconds) for each individual HTTP request. When auto-pagination
+             is enabled (default) and there are multiple pages of results, this timeout applies to
+             each page request separately, not to the entire operation
+
+     Returns:
+         CrawlJob with job information
+
+     Raises:
+         Exception: If the status check fails
+     """
+     response = await client.get(f"/v2/crawl/{job_id}", timeout=request_timeout)
+     if response.status_code >= 400:
+         handle_response_error(response, "get crawl status")
+     body = response.json()
+     if body.get("success"):
+         documents = []
+         for doc_data in body.get("data", []):
+             if isinstance(doc_data, dict):
+                 normalized = normalize_document_input(doc_data)
+                 documents.append(Document(**normalized))
+
+         # Handle pagination if requested
+         auto_paginate = pagination_config.auto_paginate if pagination_config else True
+         if auto_paginate and body.get("next"):
+             documents = await _fetch_all_pages_async(
+                 client,
+                 body.get("next"),
+                 documents,
+                 pagination_config,
+                 request_timeout=request_timeout,
+             )
+
+         return CrawlJob(
+             status=body.get("status"),
+             completed=body.get("completed", 0),
+             total=body.get("total", 0),
+             credits_used=body.get("creditsUsed", 0),
+             expires_at=body.get("expiresAt"),
+             next=body.get("next") if not auto_paginate else None,
+             data=documents,
+         )
+     raise Exception(body.get("error", "Unknown error occurred"))
+
+
+ async def _fetch_all_pages_async(
+     client: AsyncHttpClient,
+     next_url: str,
+     initial_documents: List[Document],
+     pagination_config: Optional[PaginationConfig] = None,
+     *,
+     request_timeout: Optional[float] = None,
+ ) -> List[Document]:
+     """
+     Fetch all pages of crawl results asynchronously.
+
+     Args:
+         client: Async HTTP client instance
+         next_url: URL for the next page
+         initial_documents: Documents from the first page
+         pagination_config: Optional configuration for pagination limits
+         request_timeout: Optional timeout (in seconds) for the underlying HTTP request
+
+     Returns:
+         List of all documents from all pages
+     """
+     documents = initial_documents.copy()
+     current_url = next_url
+     page_count = 0
+
+     # Apply pagination limits
+     max_pages = pagination_config.max_pages if pagination_config else None
+     max_results = pagination_config.max_results if pagination_config else None
+     max_wait_time = pagination_config.max_wait_time if pagination_config else None
+
+     start_time = time.monotonic()
+
+     while current_url:
+         # Check pagination limits (treat 0 as a valid limit)
+         if (max_pages is not None) and page_count >= max_pages:
+             break
+
+         if (max_wait_time is not None) and (time.monotonic() - start_time) > max_wait_time:
+             break
+
+         # Fetch next page
+         response = await client.get(current_url, timeout=request_timeout)
+
+         if response.status_code >= 400:
+             # Log error but continue with what we have
+             import logging
+             logger = logging.getLogger("firecrawl")
+             logger.warning("Failed to fetch next page", extra={"status_code": response.status_code})
+             break
+
+         page_data = response.json()
+
+         if not page_data.get("success"):
+             break
+
+         # Add documents from this page
+         for doc_data in page_data.get("data", []):
+             if isinstance(doc_data, dict):
+                 # Check max_results limit
+                 if (max_results is not None) and (len(documents) >= max_results):
+                     break
+                 normalized = normalize_document_input(doc_data)
+                 documents.append(Document(**normalized))
+
+         # Check if we hit max_results limit
+         if (max_results is not None) and (len(documents) >= max_results):
+             break
+
+         # Get next URL
+         current_url = page_data.get("next")
+         page_count += 1
+
+     return documents
+
+
+ async def cancel_crawl(client: AsyncHttpClient, job_id: str) -> bool:
+     """
+     Cancel a crawl job.
+
+     Args:
+         client: Async HTTP client instance
+         job_id: ID of the crawl job
+
+     Returns:
+         True if cancellation was successful
+
+     Raises:
+         Exception: If the cancellation operation fails
+     """
+     response = await client.delete(f"/v2/crawl/{job_id}")
+     if response.status_code >= 400:
+         handle_response_error(response, "cancel crawl")
+     body = response.json()
+     return body.get("status") == "cancelled"
+
+
+ async def crawl_params_preview(client: AsyncHttpClient, request: CrawlParamsRequest) -> CrawlParamsData:
+     """
+     Preview crawl parameters before starting a crawl job.
+
+     Args:
+         client: Async HTTP client instance
+         request: CrawlParamsRequest containing URL and prompt
+
+     Returns:
+         CrawlParamsData containing crawl configuration
+
+     Raises:
+         ValueError: If request is invalid
+         Exception: If the parameter preview fails
+     """
+     if not request.url or not request.url.strip():
+         raise ValueError("URL cannot be empty")
+     if not request.prompt or not request.prompt.strip():
+         raise ValueError("Prompt cannot be empty")
+     payload = {"url": request.url, "prompt": request.prompt}
+     response = await client.post("/v2/crawl/params-preview", payload)
+     if response.status_code >= 400:
+         handle_response_error(response, "crawl params preview")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     params_data = body.get("data", {})
+     converted: Dict[str, Any] = {}
+     mapping = {
+         "includePaths": "include_paths",
+         "excludePaths": "exclude_paths",
+         "maxDiscoveryDepth": "max_discovery_depth",
+         "ignoreSitemap": "ignore_sitemap",
+         "ignoreQueryParameters": "ignore_query_parameters",
+         "crawlEntireDomain": "crawl_entire_domain",
+         "allowExternalLinks": "allow_external_links",
+         "allowSubdomains": "allow_subdomains",
+         "maxConcurrency": "max_concurrency",
+         "scrapeOptions": "scrape_options",
+         "zeroDataRetention": "zero_data_retention",
+     }
+     for camel, snake in mapping.items():
+         if camel in params_data:
+             converted[snake] = params_data[camel]
+     if "webhook" in params_data:
+         wk = params_data["webhook"]
+         converted["webhook"] = wk
+     if "warning" in body:
+         converted["warning"] = body["warning"]
+     return CrawlParamsData(**converted)
+
+
+ async def get_crawl_errors(client: AsyncHttpClient, crawl_id: str) -> CrawlErrorsResponse:
+     """
+     Get errors from a crawl job.
+
+     Args:
+         client: Async HTTP client instance
+         crawl_id: ID of the crawl job
+
+     Returns:
+         CrawlErrorsResponse with errors and robots blocked
+
+     Raises:
+         Exception: If the error check operation fails
+     """
+     response = await client.get(f"/v2/crawl/{crawl_id}/errors")
+     if response.status_code >= 400:
+         handle_response_error(response, "check crawl errors")
+     body = response.json()
+     payload = body.get("data", body)
+     normalized = {
+         "errors": payload.get("errors", []),
+         "robots_blocked": payload.get("robotsBlocked", payload.get("robots_blocked", [])),
+     }
+     return CrawlErrorsResponse(**normalized)
+
+
+ async def get_active_crawls(client: AsyncHttpClient) -> ActiveCrawlsResponse:
+     """
+     Get active crawl jobs.
+
+     Args:
+         client: Async HTTP client instance
+
+     Returns:
+         ActiveCrawlsResponse with active crawl jobs
+
+     Raises:
+         Exception: If the active crawl jobs operation fails
+     """
+     response = await client.get("/v2/crawl/active")
+     if response.status_code >= 400:
+         handle_response_error(response, "get active crawls")
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+     crawls_in = body.get("crawls", [])
+     normalized = []
+     for c in crawls_in:
+         if isinstance(c, dict):
+             normalized.append({
+                 "id": c.get("id"),
+                 "team_id": c.get("teamId", c.get("team_id")),
+                 "url": c.get("url"),
+                 "options": c.get("options"),
+             })
+     return ActiveCrawlsResponse(success=True, crawls=[ActiveCrawl(**nc) for nc in normalized])
+
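A similar sketch for the async crawl helpers, assuming the same AsyncHttpClient construction as in the batch example; the CrawlRequest is built only from the url and prompt fields that _prepare_crawl_request handles explicitly, and only CrawlJob and CrawlErrorsResponse fields shown above are read:

import asyncio

from firecrawl.v2.methods.aio import crawl
from firecrawl.v2.types import CrawlRequest, PaginationConfig
from firecrawl.v2.utils.http_client_async import AsyncHttpClient


async def main() -> None:
    client = AsyncHttpClient(api_key="fc-...", api_url="https://api.firecrawl.dev")  # assumed signature

    started = await crawl.start_crawl(
        client,
        CrawlRequest(url="https://docs.firecrawl.dev", prompt="Crawl the documentation pages"),
    )

    # Disable auto-pagination to keep the `next` cursor, and bound each HTTP call to 30 seconds.
    job = await crawl.get_crawl_status(
        client,
        started.id,
        pagination_config=PaginationConfig(auto_paginate=False),
        request_timeout=30.0,
    )
    print(job.status, f"{job.completed}/{job.total}", "next page:", job.next)

    errors = await crawl.get_crawl_errors(client, started.id)
    print(errors.robots_blocked)


asyncio.run(main())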
firecrawl/v2/methods/aio/extract.py
@@ -0,0 +1,133 @@
+ from typing import Any, Dict, List, Optional
+ import asyncio
+
+ from ...types import ExtractResponse, ScrapeOptions
+ from ...utils.http_client_async import AsyncHttpClient
+ from ...utils.validation import prepare_scrape_options
+
+
+ def _prepare_extract_request(
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     integration: Optional[str] = None,
+ ) -> Dict[str, Any]:
+     body: Dict[str, Any] = {}
+     if urls is not None:
+         body["urls"] = urls
+     if prompt is not None:
+         body["prompt"] = prompt
+     if schema is not None:
+         body["schema"] = schema
+     if system_prompt is not None:
+         body["systemPrompt"] = system_prompt
+     if allow_external_links is not None:
+         body["allowExternalLinks"] = allow_external_links
+     if enable_web_search is not None:
+         body["enableWebSearch"] = enable_web_search
+     if show_sources is not None:
+         body["showSources"] = show_sources
+     if ignore_invalid_urls is not None:
+         body["ignoreInvalidURLs"] = ignore_invalid_urls
+     if scrape_options is not None:
+         prepared = prepare_scrape_options(scrape_options)
+         if prepared:
+             body["scrapeOptions"] = prepared
+     if integration is not None and str(integration).strip():
+         body["integration"] = str(integration).strip()
+     return body
+
+
+ async def start_extract(
+     client: AsyncHttpClient,
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     integration: Optional[str] = None,
+ ) -> ExtractResponse:
+     body = _prepare_extract_request(
+         urls,
+         prompt=prompt,
+         schema=schema,
+         system_prompt=system_prompt,
+         allow_external_links=allow_external_links,
+         enable_web_search=enable_web_search,
+         show_sources=show_sources,
+         scrape_options=scrape_options,
+         ignore_invalid_urls=ignore_invalid_urls,
+         integration=integration,
+     )
+     resp = await client.post("/v2/extract", body)
+     return ExtractResponse(**resp.json())
+
+
+ async def get_extract_status(client: AsyncHttpClient, job_id: str) -> ExtractResponse:
+     resp = await client.get(f"/v2/extract/{job_id}")
+     return ExtractResponse(**resp.json())
+
+
+ async def wait_extract(
+     client: AsyncHttpClient,
+     job_id: str,
+     *,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+ ) -> ExtractResponse:
+     start_ts = asyncio.get_event_loop().time()
+     while True:
+         status = await get_extract_status(client, job_id)
+         if status.status in ("completed", "failed", "cancelled"):
+             return status
+         if timeout is not None and (asyncio.get_event_loop().time() - start_ts) > timeout:
+             return status
+         await asyncio.sleep(max(1, poll_interval))
+
+
+ async def extract(
+     client: AsyncHttpClient,
+     urls: Optional[List[str]],
+     *,
+     prompt: Optional[str] = None,
+     schema: Optional[Dict[str, Any]] = None,
+     system_prompt: Optional[str] = None,
+     allow_external_links: Optional[bool] = None,
+     enable_web_search: Optional[bool] = None,
+     show_sources: Optional[bool] = None,
+     scrape_options: Optional[ScrapeOptions] = None,
+     ignore_invalid_urls: Optional[bool] = None,
+     poll_interval: int = 2,
+     timeout: Optional[int] = None,
+     integration: Optional[str] = None,
+ ) -> ExtractResponse:
+     started = await start_extract(
+         client,
+         urls,
+         prompt=prompt,
+         schema=schema,
+         system_prompt=system_prompt,
+         allow_external_links=allow_external_links,
+         enable_web_search=enable_web_search,
+         show_sources=show_sources,
+         scrape_options=scrape_options,
+         ignore_invalid_urls=ignore_invalid_urls,
+         integration=integration,
+     )
+     job_id = getattr(started, "id", None)
+     if not job_id:
+         return started
+     return await wait_extract(client, job_id, poll_interval=poll_interval, timeout=timeout)
+
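Finally, a sketch of the extract flow, which wraps start_extract and wait_extract into a single call. Client construction is assumed as before, the JSON schema is illustrative, and because this hunk does not show the ExtractResponse model, its data field is read defensively:

import asyncio

from firecrawl.v2.methods.aio import extract as aio_extract
from firecrawl.v2.utils.http_client_async import AsyncHttpClient


async def main() -> None:
    client = AsyncHttpClient(api_key="fc-...", api_url="https://api.firecrawl.dev")  # assumed signature

    result = await aio_extract.extract(
        client,
        ["https://docs.firecrawl.dev"],
        prompt="Extract the product name and a one-sentence summary",
        schema={
            "type": "object",
            "properties": {
                "name": {"type": "string"},
                "summary": {"type": "string"},
            },
        },
        poll_interval=2,   # seconds between status polls
        timeout=120,       # stop waiting after two minutes
    )
    print(result.status, getattr(result, "data", None))  # `data` attribute assumed


asyncio.run(main())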