firecrawl-py 3.2.1__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic. Click here for more details.

Files changed (85) hide show
  1. build/lib/firecrawl/__init__.py +87 -0
  2. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
  4. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
  18. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. build/lib/firecrawl/client.py +242 -0
  41. build/lib/firecrawl/firecrawl.backup.py +4635 -0
  42. build/lib/firecrawl/types.py +161 -0
  43. build/lib/firecrawl/v1/__init__.py +14 -0
  44. build/lib/firecrawl/v1/client.py +4653 -0
  45. build/lib/firecrawl/v2/__init__.py +4 -0
  46. build/lib/firecrawl/v2/client.py +802 -0
  47. build/lib/firecrawl/v2/client_async.py +250 -0
  48. build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
  49. build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
  50. build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
  51. build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
  52. build/lib/firecrawl/v2/methods/aio/map.py +59 -0
  53. build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
  54. build/lib/firecrawl/v2/methods/aio/search.py +172 -0
  55. build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
  56. build/lib/firecrawl/v2/methods/batch.py +417 -0
  57. build/lib/firecrawl/v2/methods/crawl.py +469 -0
  58. build/lib/firecrawl/v2/methods/extract.py +131 -0
  59. build/lib/firecrawl/v2/methods/map.py +77 -0
  60. build/lib/firecrawl/v2/methods/scrape.py +64 -0
  61. build/lib/firecrawl/v2/methods/search.py +197 -0
  62. build/lib/firecrawl/v2/methods/usage.py +41 -0
  63. build/lib/firecrawl/v2/types.py +665 -0
  64. build/lib/firecrawl/v2/utils/__init__.py +9 -0
  65. build/lib/firecrawl/v2/utils/error_handler.py +107 -0
  66. build/lib/firecrawl/v2/utils/get_version.py +15 -0
  67. build/lib/firecrawl/v2/utils/http_client.py +153 -0
  68. build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
  69. build/lib/firecrawl/v2/utils/normalize.py +107 -0
  70. build/lib/firecrawl/v2/utils/validation.py +324 -0
  71. build/lib/firecrawl/v2/watcher.py +301 -0
  72. build/lib/firecrawl/v2/watcher_async.py +242 -0
  73. build/lib/tests/test_change_tracking.py +98 -0
  74. build/lib/tests/test_timeout_conversion.py +117 -0
  75. firecrawl/__init__.py +1 -1
  76. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
  77. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
  78. firecrawl/v2/methods/search.py +11 -0
  79. firecrawl/v2/types.py +30 -1
  80. {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.0.dist-info}/LICENSE +0 -0
  81. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/METADATA +3 -7
  82. firecrawl_py-3.3.0.dist-info/RECORD +153 -0
  83. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/WHEEL +1 -1
  84. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/top_level.txt +2 -0
  85. firecrawl_py-3.2.1.dist-info/RECORD +0 -79
@@ -0,0 +1,107 @@
1
+ """
2
+ Error handling utilities for v2 API.
3
+ """
4
+
5
+ import requests
6
+ from typing import Dict, Any, Optional
7
+
8
+
9
class FirecrawlError(Exception):
    """Root of the Firecrawl API exception hierarchy.

    Besides the message passed to ``Exception``, each instance stores the
    ``status_code`` and ``response`` it was constructed with (both may be
    ``None``).
    """

    def __init__(
        self,
        message: str,
        status_code: Optional[int] = None,
        response: Optional[requests.Response] = None,
    ):
        super().__init__(message)
        # Keep the HTTP context on the exception so callers can inspect it.
        self.status_code, self.response = status_code, response
16
+
17
+
18
class BadRequestError(FirecrawlError):
    """Invalid request; raised for HTTP 400 responses."""
21
+
22
+
23
+
24
class UnauthorizedError(FirecrawlError):
    """Unauthorized request; raised for HTTP 401 responses."""
27
+
28
+
29
class PaymentRequiredError(FirecrawlError):
    """Payment required; raised for HTTP 402 responses."""
32
+
33
+
34
class WebsiteNotSupportedError(FirecrawlError):
    """Website not supported; raised for HTTP 403 responses."""
37
+
38
+
39
class RequestTimeoutError(FirecrawlError):
    """Request timed out; raised for HTTP 408 responses."""
42
+
43
+
44
class RateLimitError(FirecrawlError):
    """Rate limit exceeded; raised for HTTP 429 responses."""
47
+
48
+
49
class InternalServerError(FirecrawlError):
    """Internal server error; raised for HTTP 500 responses."""
52
+
53
+
54
# status code -> (message label, text appended after "Failed to {action}", exception class)
_STATUS_ERRORS = {
    400: ("Bad Request", "", BadRequestError),
    401: ("Unauthorized", "", UnauthorizedError),
    402: ("Payment Required", "", PaymentRequiredError),
    403: ("Website Not Supported", "", WebsiteNotSupportedError),
    408: ("Request Timeout", " as the request timed out", RequestTimeoutError),
    429: ("Rate Limit Exceeded", "", RateLimitError),
    500: ("Internal Server Error", "", InternalServerError),
}


def _extract_error_info(response: requests.Response):
    """Return ``(error_message, error_details)`` derived from the response body."""
    try:
        response_json = response.json()
        return (
            response_json.get('error', 'No error message provided.'),
            response_json.get('details', 'No additional error details provided.'),
        )
    except Exception:
        # Body is not JSON, or the parsed value has no ``.get``; fall back to
        # the raw text below. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt/SystemExit still propagate.
        pass

    try:
        response_text = response.text[:500]  # Limit to first 500 chars
        if response_text.strip():
            return (
                f"Server returned non-JSON response: {response_text}",
                f"Full response status: {response.status_code}",
            )
        return (
            f"Server returned empty response with status {response.status_code}",
            "No additional details available",
        )
    except Exception:
        return (
            f"Server returned unreadable response with status {response.status_code}",
            "No additional details available",
        )


def handle_response_error(response: requests.Response, action: str) -> None:
    """
    Handle API response errors and raise appropriate exceptions.

    This function never returns normally: every path ends in a ``raise``.

    Args:
        response: The HTTP response object
        action: Description of the action being performed; interpolated
            into the exception message

    Raises:
        BadRequestError: status 400
        UnauthorizedError: status 401
        PaymentRequiredError: status 402
        WebsiteNotSupportedError: status 403
        RequestTimeoutError: status 408
        RateLimitError: status 429
        InternalServerError: status 500
        FirecrawlError: any other status code
    """
    error_message, error_details = _extract_error_info(response)
    status = response.status_code

    known = _STATUS_ERRORS.get(status)
    if known is None:
        message = f"Unexpected error during {action}: Status code {status}. {error_message} - {error_details}"
        raise FirecrawlError(message, status, response)

    label, suffix, error_cls = known
    message = f"{label}: Failed to {action}{suffix}. {error_message} - {error_details}"
    raise error_cls(message, status, response)
@@ -0,0 +1,15 @@
1
+ import os
2
+ import re
3
+ from pathlib import Path
4
+
5
def get_version():
    """Return the package version string parsed from the package ``__init__.py``.

    The file is looked up two directories above this module's directory and
    searched for a ``__version__ = "..."`` assignment at the start of a line.

    Returns:
        The matched version string with surrounding whitespace stripped, or
        the placeholder ``"3.x.x"`` when no assignment is found or the file
        cannot be read (in which case the failure is printed).
    """
    try:
        package_path = Path(__file__).parents[2]
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        version_file = (package_path / "__init__.py").read_text(encoding="utf-8")
        version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M)
        if version_match:
            return version_match.group(1).strip()
        return "3.x.x"
    except Exception as e:
        # Best-effort lookup: never let version detection break the import.
        print(f"Failed to get version from __init__.py: {e}")
        return "3.x.x"
@@ -0,0 +1,153 @@
1
+ """
2
+ HTTP client utilities for v2 API.
3
+ """
4
+
5
+ import time
6
+ from typing import Dict, Any, Optional
7
+ import requests
8
+ from .get_version import get_version
9
+
10
+ version = get_version()
11
+
12
class HttpClient:
    """HTTP client with retry logic and error handling.

    Each verb method retries on ``requests.RequestException`` and on HTTP 502
    responses, sleeping ``backoff_factor * 2 ** attempt`` seconds between
    attempts. A 502 received on the final attempt is returned to the caller
    rather than raised.
    """

    def __init__(self, api_key: str, api_url: str):
        self.api_key = api_key
        self.api_url = api_url

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """Prepare headers for API requests.

        Args:
            idempotency_key: When truthy, sent as ``x-idempotency-key``.

        Returns:
            A new header dict with JSON content type and bearer authorization.
        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }

        if idempotency_key:
            headers['x-idempotency-key'] = idempotency_key

        return headers

    def _request_with_retries(
        self,
        send,
        method_name: str,
        retries: int,
        backoff_factor: float
    ) -> requests.Response:
        """Invoke ``send()`` up to ``retries`` times with exponential backoff.

        Args:
            send: Zero-argument callable performing one HTTP request.
            method_name: Verb name used in the fallback exception message.
            retries: Maximum number of attempts.
            backoff_factor: Base delay in seconds; doubled on each attempt.

        Returns:
            The first response that is not a retried 502.

        Raises:
            requests.RequestException: If the final attempt fails.
            Exception: If no attempt was made at all (``retries`` <= 0).
        """
        last_exception = None

        for attempt in range(retries):
            is_last_attempt = attempt == retries - 1
            try:
                response = send()
            except requests.RequestException as e:
                last_exception = e
                if is_last_attempt:
                    raise
                time.sleep(backoff_factor * (2 ** attempt))
                continue

            # 502 is treated as transient; on the last attempt hand the
            # response back so the caller can report it.
            if response.status_code == 502 and not is_last_attempt:
                time.sleep(backoff_factor * (2 ** attempt))
                continue

            return response

        # Only reachable when the loop body never ran (retries <= 0).
        raise last_exception or Exception(f"Unexpected error in {method_name} request")

    def post(
        self,
        endpoint: str,
        data: Dict[str, Any],
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> requests.Response:
        """Make a POST request with retry logic.

        The JSON body is a shallow copy of ``data`` with an ``origin`` entry
        added; the caller's dict is left untouched.

        Args:
            endpoint: Path appended to ``api_url``.
            data: JSON-serialisable request body.
            headers: Request headers; defaults to ``_prepare_headers()``.
            timeout: Passed through to ``requests.post``.
            retries: Maximum number of attempts.
            backoff_factor: Base delay in seconds between attempts.
        """
        if headers is None:
            headers = self._prepare_headers()

        # Copy so the SDK origin tag does not leak into the caller's dict.
        payload = dict(data)
        payload['origin'] = f'python-sdk@{version}'

        url = f"{self.api_url}{endpoint}"

        return self._request_with_retries(
            lambda: requests.post(url, headers=headers, json=payload, timeout=timeout),
            "POST",
            retries,
            backoff_factor,
        )

    def get(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> requests.Response:
        """Make a GET request with retry logic.

        Args:
            endpoint: Path appended to ``api_url``.
            headers: Request headers; defaults to ``_prepare_headers()``.
            timeout: Passed through to ``requests.get``.
            retries: Maximum number of attempts.
            backoff_factor: Base delay in seconds between attempts.
        """
        if headers is None:
            headers = self._prepare_headers()

        url = f"{self.api_url}{endpoint}"

        return self._request_with_retries(
            lambda: requests.get(url, headers=headers, timeout=timeout),
            "GET",
            retries,
            backoff_factor,
        )

    def delete(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
        retries: int = 3,
        backoff_factor: float = 0.5
    ) -> requests.Response:
        """Make a DELETE request with retry logic.

        Args:
            endpoint: Path appended to ``api_url``.
            headers: Request headers; defaults to ``_prepare_headers()``.
            timeout: Passed through to ``requests.delete``.
            retries: Maximum number of attempts.
            backoff_factor: Base delay in seconds between attempts.
        """
        if headers is None:
            headers = self._prepare_headers()

        url = f"{self.api_url}{endpoint}"

        return self._request_with_retries(
            lambda: requests.delete(url, headers=headers, timeout=timeout),
            "DELETE",
            retries,
            backoff_factor,
        )
@@ -0,0 +1,65 @@
1
+ import httpx
2
+ from typing import Optional, Dict, Any
3
+ from .get_version import get_version
4
+
5
+ version = get_version()
6
+
7
+
8
class AsyncHttpClient:
    """Async HTTP client backed by a single shared ``httpx.AsyncClient``.

    The underlying client is created with ``api_url`` as its base URL, bearer
    authorization and JSON content-type default headers, and
    ``max_keepalive_connections=0``. Call ``close()`` when finished.
    """

    def __init__(self, api_key: str, api_url: str):
        self.api_key = api_key
        self.api_url = api_url
        default_headers = {
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "application/json",
        }
        self._client = httpx.AsyncClient(
            base_url=api_url,
            headers=default_headers,
            limits=httpx.Limits(max_keepalive_connections=0),
        )

    async def close(self) -> None:
        """Close the underlying ``httpx.AsyncClient``."""
        await self._client.aclose()

    def _headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """Return per-request headers: the idempotency key when truthy, else empty."""
        if not idempotency_key:
            return {}
        return {"x-idempotency-key": idempotency_key}

    def _merged_headers(self, headers: Optional[Dict[str, str]]) -> Dict[str, str]:
        """Overlay caller-supplied ``headers`` on top of ``_headers()``."""
        merged = self._headers()
        merged.update(headers or {})
        return merged

    async def post(
        self,
        endpoint: str,
        data: Dict[str, Any],
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """POST a copy of ``data`` (with an ``origin`` entry added) as JSON to ``endpoint``."""
        body = dict(data)  # copy: the caller's dict is not modified
        body.update(origin=f"python-sdk@{version}")
        return await self._client.post(
            endpoint,
            json=body,
            headers=self._merged_headers(headers),
            timeout=timeout,
        )

    async def get(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """Send a GET request to ``endpoint``."""
        request_headers = self._merged_headers(headers)
        return await self._client.get(
            endpoint, headers=request_headers, timeout=timeout
        )

    async def delete(
        self,
        endpoint: str,
        headers: Optional[Dict[str, str]] = None,
        timeout: Optional[float] = None,
    ) -> httpx.Response:
        """Send a DELETE request to ``endpoint``."""
        request_headers = self._merged_headers(headers)
        return await self._client.delete(
            endpoint, headers=request_headers, timeout=timeout
        )
65
+
@@ -0,0 +1,107 @@
1
+ """
2
+ Normalization helpers for v2 API payloads to avoid relying on Pydantic aliases.
3
+ """
4
+
5
+ from typing import Any, Dict, List
6
+ from ..types import DocumentMetadata
7
+
8
+
9
+ def _map_metadata_keys(md: Dict[str, Any]) -> Dict[str, Any]:
10
+ """
11
+ Convert API v2 camelCase metadata keys to snake_case expected by DocumentMetadata.
12
+ Leaves unknown keys as-is.
13
+ """
14
+ mapping = {
15
+ # OpenGraph
16
+ "ogTitle": "og_title",
17
+ "ogDescription": "og_description",
18
+ "ogUrl": "og_url",
19
+ "ogImage": "og_image",
20
+ "ogAudio": "og_audio",
21
+ "ogDeterminer": "og_determiner",
22
+ "ogLocale": "og_locale",
23
+ "ogLocaleAlternate": "og_locale_alternate",
24
+ "ogSiteName": "og_site_name",
25
+ "ogVideo": "og_video",
26
+ # Dublin Core and misc
27
+ "dcTermsCreated": "dc_terms_created",
28
+ "dcDateCreated": "dc_date_created",
29
+ "dcDate": "dc_date",
30
+ "dcTermsType": "dc_terms_type",
31
+ "dcType": "dc_type",
32
+ "dcTermsAudience": "dc_terms_audience",
33
+ "dcTermsSubject": "dc_terms_subject",
34
+ "dcSubject": "dc_subject",
35
+ "dcDescription": "dc_description",
36
+ "dcTermsKeywords": "dc_terms_keywords",
37
+ "modifiedTime": "modified_time",
38
+ "publishedTime": "published_time",
39
+ "articleTag": "article_tag",
40
+ "articleSection": "article_section",
41
+ # Response-level
42
+ "sourceURL": "source_url",
43
+ "statusCode": "status_code",
44
+ "scrapeId": "scrape_id",
45
+ "numPages": "num_pages",
46
+ "contentType": "content_type",
47
+ "proxyUsed": "proxy_used",
48
+ "cacheState": "cache_state",
49
+ "cachedAt": "cached_at",
50
+ "creditsUsed": "credits_used",
51
+ }
52
+
53
+ out: Dict[str, Any] = {}
54
+ for k, v in md.items():
55
+ snake = mapping.get(k, k)
56
+ out[snake] = v
57
+
58
+ # Light coercions where server may send strings/lists
59
+ if isinstance(out.get("status_code"), str):
60
+ try:
61
+ out["status_code"] = int(out["status_code"]) # type: ignore
62
+ except ValueError:
63
+ pass
64
+
65
+ # Generic rule: if a value is a list, join with ", " for string-like fields,
66
+ # except for explicit fields we preserve as lists.
67
+ preserve_list_fields: List[str] = [
68
+ "og_locale_alternate",
69
+ ]
70
+ for f, val in list(out.items()):
71
+ if isinstance(val, list) and f not in preserve_list_fields:
72
+ try:
73
+ out[f] = ", ".join(str(x) for x in val)
74
+ except Exception:
75
+ # Fallback: keep original list if join fails
76
+ pass
77
+
78
+ return out
79
+
80
+
81
def normalize_document_input(doc: Dict[str, Any]) -> Dict[str, Any]:
    """
    Return a shallow copy of a raw API Document dict reshaped for the SDK.

    - ``rawHtml`` is renamed to ``raw_html`` and ``changeTracking`` to
      ``change_tracking``, unless the snake_case key is already present.
    - A dict-valued ``metadata`` entry has its keys mapped to snake_case and
      is replaced by a ``DocumentMetadata`` instance; if constructing the
      model raises, the mapped dict is stored instead.
    """
    result = dict(doc)

    renames = (
        ("rawHtml", "raw_html"),
        ("changeTracking", "change_tracking"),
    )
    for camel_key, snake_key in renames:
        # Never overwrite a snake_case value the caller already supplied.
        if camel_key in result and snake_key not in result:
            result[snake_key] = result.pop(camel_key)

    metadata = result.get("metadata")
    if not isinstance(metadata, dict):
        return result

    snake_metadata = _map_metadata_keys(metadata)
    try:
        # Prefer a typed object for downstream consumers.
        result["metadata"] = DocumentMetadata(**snake_metadata)
    except Exception:
        # Model construction failed; hand back the mapped dict instead.
        result["metadata"] = snake_metadata

    return result
106
+
107
+