firecrawl-py 3.2.1__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff shows the changes between publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of firecrawl-py might be problematic.

Files changed (85)
  1. build/lib/firecrawl/__init__.py +87 -0
  2. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
  4. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
  18. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. build/lib/firecrawl/client.py +242 -0
  41. build/lib/firecrawl/firecrawl.backup.py +4635 -0
  42. build/lib/firecrawl/types.py +161 -0
  43. build/lib/firecrawl/v1/__init__.py +14 -0
  44. build/lib/firecrawl/v1/client.py +4653 -0
  45. build/lib/firecrawl/v2/__init__.py +4 -0
  46. build/lib/firecrawl/v2/client.py +802 -0
  47. build/lib/firecrawl/v2/client_async.py +250 -0
  48. build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
  49. build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
  50. build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
  51. build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
  52. build/lib/firecrawl/v2/methods/aio/map.py +59 -0
  53. build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
  54. build/lib/firecrawl/v2/methods/aio/search.py +172 -0
  55. build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
  56. build/lib/firecrawl/v2/methods/batch.py +417 -0
  57. build/lib/firecrawl/v2/methods/crawl.py +469 -0
  58. build/lib/firecrawl/v2/methods/extract.py +131 -0
  59. build/lib/firecrawl/v2/methods/map.py +77 -0
  60. build/lib/firecrawl/v2/methods/scrape.py +64 -0
  61. build/lib/firecrawl/v2/methods/search.py +197 -0
  62. build/lib/firecrawl/v2/methods/usage.py +41 -0
  63. build/lib/firecrawl/v2/types.py +665 -0
  64. build/lib/firecrawl/v2/utils/__init__.py +9 -0
  65. build/lib/firecrawl/v2/utils/error_handler.py +107 -0
  66. build/lib/firecrawl/v2/utils/get_version.py +15 -0
  67. build/lib/firecrawl/v2/utils/http_client.py +153 -0
  68. build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
  69. build/lib/firecrawl/v2/utils/normalize.py +107 -0
  70. build/lib/firecrawl/v2/utils/validation.py +324 -0
  71. build/lib/firecrawl/v2/watcher.py +301 -0
  72. build/lib/firecrawl/v2/watcher_async.py +242 -0
  73. build/lib/tests/test_change_tracking.py +98 -0
  74. build/lib/tests/test_timeout_conversion.py +117 -0
  75. firecrawl/__init__.py +1 -1
  76. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
  77. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
  78. firecrawl/v2/methods/search.py +11 -0
  79. firecrawl/v2/types.py +30 -1
  80. {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.0.dist-info}/LICENSE +0 -0
  81. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/METADATA +3 -7
  82. firecrawl_py-3.3.0.dist-info/RECORD +153 -0
  83. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/WHEEL +1 -1
  84. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.0.dist-info}/top_level.txt +2 -0
  85. firecrawl_py-3.2.1.dist-info/RECORD +0 -79
build/lib/firecrawl/v2/methods/scrape.py
@@ -0,0 +1,64 @@
+ """
+ Scraping functionality for Firecrawl v2 API.
+ """
+
+ from typing import Optional, Dict, Any
+ from ..types import ScrapeOptions, Document
+ from ..utils.normalize import normalize_document_input
+ from ..utils import HttpClient, handle_response_error, prepare_scrape_options, validate_scrape_options
+
+
+ def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
+     """
+     Prepare a scrape request payload for v2 API.
+
+     Args:
+         url: URL to scrape
+         options: ScrapeOptions (snake_case) to convert and include
+
+     Returns:
+         Request payload dictionary with camelCase fields
+     """
+     if not url or not url.strip():
+         raise ValueError("URL cannot be empty")
+
+     request_data: Dict[str, Any] = {"url": url.strip()}
+
+     if options is not None:
+         validated = validate_scrape_options(options)
+         if validated is not None:
+             opts = prepare_scrape_options(validated)
+             if opts:
+                 request_data.update(opts)
+
+     return request_data
+
+ def scrape(client: HttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
+     """
+     Scrape a single URL and return the document.
+
+     The v2 API returns: { success: boolean, data: Document }
+     We surface just the Document to callers.
+
+     Args:
+         client: HTTP client instance
+         url: URL to scrape
+         options: Scraping options (snake_case)
+
+     Returns:
+         Document
+     """
+     payload = _prepare_scrape_request(url, options)
+
+     response = client.post("/v2/scrape", payload)
+
+     if not response.ok:
+         handle_response_error(response, "scrape")
+
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     document_data = body.get("data", {})
+     normalized = normalize_document_input(document_data)
+     return Document(**normalized)
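
The new module-level scrape helper is thin: it validates the URL, converts snake_case options to the camelCase payload the API expects, and unwraps { success, data } into a Document. A minimal sketch of calling it directly follows; the HttpClient constructor arguments and the ScrapeOptions formats field are assumptions for illustration, not part of this diff (in normal use the top-level client wires this up for you):

from firecrawl.v2.utils import HttpClient
from firecrawl.v2.types import ScrapeOptions
from firecrawl.v2.methods.scrape import scrape

# Assumed constructor signature; real wiring normally happens inside the
# Firecrawl client class rather than by hand.
client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")

# ScrapeOptions fields (formats assumed here) are converted to camelCase
# by prepare_scrape_options() before being merged into the payload.
doc = scrape(client, "https://example.com", ScrapeOptions(formats=["markdown"]))
print(doc.markdown)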
build/lib/firecrawl/v2/methods/search.py
@@ -0,0 +1,197 @@
+ """
+ Search functionality for Firecrawl v2 API.
+ """
+
+ import re
+ from typing import Dict, Any, Union, List, TypeVar, Type
+ from ..types import SearchRequest, SearchData, Document, SearchResultWeb, SearchResultNews, SearchResultImages
+ from ..utils.normalize import normalize_document_input
+ from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+
+ T = TypeVar("T")
+
+ def search(
+     client: HttpClient,
+     request: SearchRequest
+ ) -> SearchData:
+     """
+     Search for documents.
+
+     Args:
+         client: HTTP client instance
+         request: Search request
+
+     Returns:
+         SearchData with search results grouped by source type
+
+     Raises:
+         FirecrawlError: If the search operation fails
+     """
+     request_data = _prepare_search_request(request)
+     try:
+         response = client.post("/v2/search", request_data)
+         if response.status_code != 200:
+             handle_response_error(response, "search")
+         response_data = response.json()
+         if not response_data.get("success"):
+             handle_response_error(response, "search")
+         data = response_data.get("data", {}) or {}
+         out = SearchData()
+         if "web" in data:
+             out.web = _transform_array(data["web"], SearchResultWeb)
+         if "news" in data:
+             out.news = _transform_array(data["news"], SearchResultNews)
+         if "images" in data:
+             out.images = _transform_array(data["images"], SearchResultImages)
+         return out
+     except Exception as err:
+         # If the error is an HTTP error from requests, handle it
+         # (simulate isAxiosError by checking for requests' HTTPError or Response)
+         if hasattr(err, "response"):
+             handle_response_error(getattr(err, "response"), "search")
+         raise err
+
+ def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, 'Document']]:
+     """
+     Transforms an array of items into a list of result_type or Document.
+     If the item dict contains any of the special keys, it is treated as a Document.
+     Otherwise, it is treated as result_type.
+     If the item is not a dict, it is wrapped as result_type with url=item.
+     """
+     results: List[Union[T, 'Document']] = []
+     for item in arr:
+         if item and isinstance(item, dict):
+             if (
+                 "markdown" in item or
+                 "html" in item or
+                 "rawHtml" in item or
+                 "links" in item or
+                 "screenshot" in item or
+                 "changeTracking" in item or
+                 "summary" in item or
+                 "json" in item
+             ):
+                 results.append(Document(**item))
+             else:
+                 results.append(result_type(**item))
+         else:
+             # For non-dict items, assume it's a URL and wrap in result_type
+             results.append(result_type(url=item))
+     return results
+
+ def _validate_search_request(request: SearchRequest) -> SearchRequest:
+     """
+     Validate and normalize search request.
+
+     Args:
+         request: Search request to validate
+
+     Returns:
+         Validated request
+
+     Raises:
+         ValueError: If request is invalid
+     """
+     # Validate query
+     if not request.query or not request.query.strip():
+         raise ValueError("Query cannot be empty")
+
+     # Validate limit
+     if request.limit is not None:
+         if request.limit <= 0:
+             raise ValueError("Limit must be positive")
+         if request.limit > 100:
+             raise ValueError("Limit cannot exceed 100")
+
+     # Validate timeout
+     if request.timeout is not None:
+         if request.timeout <= 0:
+             raise ValueError("Timeout must be positive")
+         if request.timeout > 300000:  # 5 minutes max
+             raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+     # Validate sources (if provided)
+     if request.sources is not None:
+         valid_sources = {"web", "news", "images"}
+         for source in request.sources:
+             if isinstance(source, str):
+                 if source not in valid_sources:
+                     raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+             elif hasattr(source, 'type'):
+                 if source.type not in valid_sources:
+                     raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+     # Validate categories (if provided)
+     if request.categories is not None:
+         valid_categories = {"github", "research"}
+         for category in request.categories:
+             if isinstance(category, str):
+                 if category not in valid_categories:
+                     raise ValueError(f"Invalid category type: {category}. Valid types: {valid_categories}")
+             elif hasattr(category, 'type'):
+                 if category.type not in valid_categories:
+                     raise ValueError(f"Invalid category type: {category.type}. Valid types: {valid_categories}")
+
+     # Validate location (if provided)
+     if request.location is not None:
+         if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+             raise ValueError("Location must be a non-empty string")
+
+     # Validate tbs (time-based search, if provided)
+     if request.tbs is not None:
+         valid_tbs_values = {
+             "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",  # Google time filters
+             "d", "w", "m", "y"  # Short forms
+         }
+
+         if request.tbs in valid_tbs_values:
+             pass  # Valid predefined value
+         elif request.tbs.startswith("cdr:"):
+             custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
+             if not re.match(custom_date_pattern, request.tbs):
+                 raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+         else:
+             raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+
+     # Validate scrape_options (if provided)
+     if request.scrape_options is not None:
+         validate_scrape_options(request.scrape_options)
+
+     return request
+
+
+ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+     """
+     Prepare a search request payload.
+
+     Args:
+         request: Search request
+
+     Returns:
+         Request payload dictionary
+     """
+     validated_request = _validate_search_request(request)
+     data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+     # Ensure default values are included only if not explicitly set to None
+     if "limit" not in data and validated_request.limit is not None:
+         data["limit"] = validated_request.limit
+     if "timeout" not in data and validated_request.timeout is not None:
+         data["timeout"] = validated_request.timeout
+
+     # Handle snake_case to camelCase conversions manually
+     # (Pydantic Field() aliases interfere with value assignment)
+
+     # ignore_invalid_urls → ignoreInvalidURLs
+     if validated_request.ignore_invalid_urls is not None:
+         data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+         data.pop("ignore_invalid_urls", None)
+
+     # scrape_options → scrapeOptions
+     if validated_request.scrape_options is not None:
+         scrape_data = prepare_scrape_options(validated_request.scrape_options)
+         if scrape_data:
+             data["scrapeOptions"] = scrape_data
+         data.pop("scrape_options", None)
+
+     return data
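
Search results come back grouped by source, and individual items may be either lightweight result stubs or fully scraped Documents (see _transform_array above). A hedged sketch of driving this function directly; the HttpClient construction is again an assumption, while the request field names are taken from the validation logic in the diff:

from firecrawl.v2.utils import HttpClient
from firecrawl.v2.types import SearchRequest
from firecrawl.v2.methods.search import search

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

# query, sources, limit, and tbs appear in _validate_search_request above;
# "qdr:w" restricts results to the past week.
req = SearchRequest(query="firecrawl python sdk", sources=["web", "news"], limit=5, tbs="qdr:w")
results = search(client, req)

# Items carrying scraped-content keys (markdown, html, ...) are promoted to
# Document by _transform_array; everything else stays a SearchResultWeb stub.
for hit in results.web or []:
    print(type(hit).__name__, getattr(hit, "url", None))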
build/lib/firecrawl/v2/methods/usage.py
@@ -0,0 +1,41 @@
+ from ..utils import HttpClient, handle_response_error
+ from ..types import ConcurrencyCheck, CreditUsage, TokenUsage
+
+
+ def get_concurrency(client: HttpClient) -> ConcurrencyCheck:
+     resp = client.get("/v2/concurrency-check")
+     if not resp.ok:
+         handle_response_error(resp, "get concurrency")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return ConcurrencyCheck(
+         concurrency=data.get("concurrency"),
+         max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
+     )
+
+
+ def get_credit_usage(client: HttpClient) -> CreditUsage:
+     resp = client.get("/v2/team/credit-usage")
+     if not resp.ok:
+         handle_response_error(resp, "get credit usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return CreditUsage(remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)))
+
+
+ def get_token_usage(client: HttpClient) -> TokenUsage:
+     resp = client.get("/v2/team/token-usage")
+     if not resp.ok:
+         handle_response_error(resp, "get token usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return TokenUsage(
+         remaining_tokens=data.get("remainingTokens", 0)
+     )
+
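
The three usage helpers are symmetric wrappers over GET endpoints. A short sketch, with the same caveat that the HttpClient construction is an assumption; only the helper names and return types come from the diff:

from firecrawl.v2.utils import HttpClient
from firecrawl.v2.methods.usage import get_concurrency, get_credit_usage, get_token_usage

client = HttpClient(api_key="fc-YOUR-KEY", api_url="https://api.firecrawl.dev")  # assumed signature

# Each helper tolerates both wrapped ({success, data}) and flat payloads via
# body.get("data", body), and get_concurrency accepts camelCase or snake_case keys.
print(get_concurrency(client).max_concurrency)
print(get_credit_usage(client).remaining_credits)
print(get_token_usage(client).remaining_tokens)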