firecrawl 4.12.0__py3-none-any.whl

This diff shows the content of a publicly available package version as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in the public registry.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/methods/aio/map.py
@@ -0,0 +1,65 @@
+from typing import Optional, Dict, Any
+from ...types import MapOptions, MapData, LinkResult
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+
+
+def _prepare_map_request(url: str, options: Optional[MapOptions] = None) -> Dict[str, Any]:
+    if not url or not url.strip():
+        raise ValueError("URL cannot be empty")
+    payload: Dict[str, Any] = {"url": url.strip()}
+    if options is not None:
+        data: Dict[str, Any] = {}
+        if getattr(options, "sitemap", None) is not None:
+            data["sitemap"] = options.sitemap
+        if options.search is not None:
+            data["search"] = options.search
+        if options.include_subdomains is not None:
+            data["includeSubdomains"] = options.include_subdomains
+        if options.ignore_query_parameters is not None:
+            data["ignoreQueryParameters"] = options.ignore_query_parameters
+        if options.limit is not None:
+            data["limit"] = options.limit
+        if options.timeout is not None:
+            data["timeout"] = options.timeout
+        if options.integration is not None:
+            data["integration"] = options.integration.strip()
+        if options.location is not None:
+            data["location"] = options.location.model_dump(exclude_none=True)
+        payload.update(data)
+    return payload
+
+
+async def map(client: AsyncHttpClient, url: str, options: Optional[MapOptions] = None) -> MapData:
+    request_data = _prepare_map_request(url, options)
+    response = await client.post("/v2/map", request_data)
+    if response.status_code >= 400:
+        handle_response_error(response, "map")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+
+
+    # data = body.get("data", {})
+    # result_links: list[LinkResult] = []
+    # for item in data.get("links", []):
+    #     if isinstance(item, dict):
+    #         result_links.append(
+    #             LinkResult(
+    #                 url=item.get("url", ""),
+    #                 title=item.get("title"),
+    #                 description=item.get("description"),
+    #             )
+    #         )
+    #     elif isinstance(item, str):
+    #         result_links.append(LinkResult(url=item))
+
+    result_links: list[LinkResult] = []
+    for item in body.get("links", []):
+        if isinstance(item, dict):
+            result_links.append(LinkResult(url=item.get("url", ""), title=item.get("title"), description=item.get("description")))
+        elif isinstance(item, str):
+            result_links.append(LinkResult(url=item))
+
+    return MapData(links=result_links)
+
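For orientation (this hunk matches the +65 entry firecrawl/v2/methods/aio/map.py above): _prepare_map_request converts the SDK's snake_case MapOptions fields into the camelCase body that POST /v2/map expects, and map() folds the returned links into MapData. A minimal sketch of the payload step, assuming MapOptions (defined in firecrawl/v2/types.py, not shown in this diff) accepts these fields as optional keyword arguments:

from firecrawl.v2.types import MapOptions
from firecrawl.v2.methods.aio.map import _prepare_map_request

# Illustrative options only; fields left unset do not add payload keys
# unless their MapOptions defaults are non-None.
opts = MapOptions(search="docs", include_subdomains=True, limit=50)
payload = _prepare_map_request("https://example.com ", opts)
# payload now includes (trailing whitespace stripped, keys camelCased):
# {"url": "https://example.com", "search": "docs", "includeSubdomains": True, "limit": 50}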
firecrawl/v2/methods/aio/scrape.py
@@ -0,0 +1,33 @@
+from typing import Optional, Dict, Any
+from ...types import ScrapeOptions, Document
+from ...utils.normalize import normalize_document_input
+from ...utils.error_handler import handle_response_error
+from ...utils.validation import prepare_scrape_options, validate_scrape_options
+from ...utils.http_client_async import AsyncHttpClient
+
+
+async def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
+    if not url or not url.strip():
+        raise ValueError("URL cannot be empty")
+    payload: Dict[str, Any] = {"url": url.strip()}
+    if options is not None:
+        validated = validate_scrape_options(options)
+        if validated is not None:
+            opts = prepare_scrape_options(validated)
+            if opts:
+                payload.update(opts)
+    return payload
+
+
+async def scrape(client: AsyncHttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
+    payload = await _prepare_scrape_request(url, options)
+    response = await client.post("/v2/scrape", payload)
+    if response.status_code >= 400:
+        handle_response_error(response, "scrape")
+    body = response.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error occurred"))
+    document_data = body.get("data", {})
+    normalized = normalize_document_input(document_data)
+    return Document(**normalized)
+
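The scrape module (the +33 entry firecrawl/v2/methods/aio/scrape.py) is intentionally thin: option validation and serialization are delegated to the shared validate_scrape_options / prepare_scrape_options helpers, and the response's data object is normalized into a Document. A rough usage sketch, assuming a configured AsyncHttpClient is available; its constructor lives in firecrawl/v2/utils/http_client_async.py and is not shown in this diff:

import asyncio
from firecrawl.v2.methods.aio.scrape import scrape

async def fetch_page(client):
    # `client` is an AsyncHttpClient wired with the API URL and key by the SDK's client layer.
    return await scrape(client, "https://example.com")  # -> Document

# asyncio.run(fetch_page(client))  # client construction omitted here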
firecrawl/v2/methods/aio/search.py
@@ -0,0 +1,176 @@
+import re
+from typing import Dict, Any, Union, List, TypeVar, Type
+from ...types import (
+    SearchRequest,
+    SearchData,
+    Document,
+    SearchResultWeb,
+    SearchResultNews,
+    SearchResultImages,
+)
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...utils.normalize import normalize_document_input
+from ...utils.validation import validate_scrape_options, prepare_scrape_options
+
+T = TypeVar("T")
+
+async def search(
+    client: AsyncHttpClient,
+    request: SearchRequest
+) -> SearchData:
+    """
+    Async search for documents.
+
+    Args:
+        client: Async HTTP client instance
+        request: Search request
+
+    Returns:
+        SearchData with search results grouped by source type
+
+    Raises:
+        FirecrawlError: If the search operation fails
+    """
+    request_data = _prepare_search_request(request)
+    try:
+        response = await client.post("/v2/search", request_data)
+        if response.status_code != 200:
+            handle_response_error(response, "search")
+        response_data = response.json()
+        if not response_data.get("success"):
+            handle_response_error(response, "search")
+        data = response_data.get("data", {}) or {}
+        out = SearchData()
+        if "web" in data:
+            out.web = _transform_array(data["web"], SearchResultWeb)
+        if "news" in data:
+            out.news = _transform_array(data["news"], SearchResultNews)
+        if "images" in data:
+            out.images = _transform_array(data["images"], SearchResultImages)
+        return out
+    except Exception as err:
+        if hasattr(err, "response"):
+            handle_response_error(getattr(err, "response"), "search")
+        raise err
+
+def _transform_array(arr: List[Any], result_type: Type[T]) -> List[Union[T, Document]]:
+    """
+    Transforms an array of items into a list of result_type or Document.
+    If the item dict contains any of the special keys, it is treated as a Document.
+    Otherwise, it is treated as result_type.
+    If the item is not a dict, it is wrapped as result_type with url=item.
+    """
+    results: List[Union[T, Document]] = []
+    for item in arr:
+        if item and isinstance(item, dict):
+            if (
+                "markdown" in item or
+                "html" in item or
+                "rawHtml" in item or
+                "links" in item or
+                "screenshot" in item or
+                "changeTracking" in item or
+                "summary" in item or
+                "json" in item
+            ):
+                results.append(Document(**normalize_document_input(item)))
+            else:
+                results.append(result_type(**item))
+        else:
+            results.append(result_type(url=item))
+    return results
+
+def _validate_search_request(request: SearchRequest) -> SearchRequest:
+    """
+    Validate and normalize search request.
+
+    Args:
+        request: Search request to validate
+
+    Returns:
+        Validated request
+
+    Raises:
+        ValueError: If request is invalid
+    """
+    if not request.query or not request.query.strip():
+        raise ValueError("Query cannot be empty")
+
+    if request.limit is not None:
+        if request.limit <= 0:
+            raise ValueError("Limit must be positive")
+        if request.limit > 100:
+            raise ValueError("Limit cannot exceed 100")
+
+    if request.timeout is not None:
+        if request.timeout <= 0:
+            raise ValueError("Timeout must be positive")
+        if request.timeout > 300000:
+            raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+    if request.sources is not None:
+        valid_sources = {"web", "news", "images"}
+        for source in request.sources:
+            if isinstance(source, str):
+                if source not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+            elif hasattr(source, 'type'):
+                if source.type not in valid_sources:
+                    raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+    if request.location is not None:
+        if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+            raise ValueError("Location must be a non-empty string")
+
+    if request.tbs is not None:
+        valid_tbs_values = {
+            "qdr:h", "qdr:d", "qdr:w", "qdr:m", "qdr:y",
+            "d", "w", "m", "y"
+        }
+        if request.tbs in valid_tbs_values:
+            pass
+        elif request.tbs.startswith("cdr:"):
+            custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"
+            if not re.match(custom_date_pattern, request.tbs):
+                raise ValueError(f"Invalid custom date range format: {request.tbs}. Expected format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+        else:
+            raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values} or custom date range format: cdr:1,cd_min:MM/DD/YYYY,cd_max:MM/DD/YYYY")
+
+    if request.scrape_options is not None:
+        validate_scrape_options(request.scrape_options)
+
+    return request
+
+def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+    """
+    Prepare a search request payload.
+
+    Args:
+        request: Search request
+
+    Returns:
+        Request payload dictionary
+    """
+    validated_request = _validate_search_request(request)
+    data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+    if "limit" not in data and validated_request.limit is not None:
+        data["limit"] = validated_request.limit
+    if "timeout" not in data and validated_request.timeout is not None:
+        data["timeout"] = validated_request.timeout
+
+    if validated_request.ignore_invalid_urls is not None:
+        data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+        data.pop("ignore_invalid_urls", None)
+
+    if validated_request.scrape_options is not None:
+        scrape_data = prepare_scrape_options(validated_request.scrape_options)
+        if scrape_data:
+            data["scrapeOptions"] = scrape_data
+        data.pop("scrape_options", None)
+
+    if (v := getattr(validated_request, "integration", None)) is not None and str(v).strip():
+        data["integration"] = str(validated_request.integration).strip()
+
+    return data
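One detail worth calling out from _validate_search_request (this hunk matches the +176 entry firecrawl/v2/methods/aio/search.py): the tbs field accepts either a fixed set of Google-style time filters or a strictly formatted custom date range. A small illustration of the same regex used above, with example values only:

import re

custom_date_pattern = r"^cdr:1,cd_min:\d{1,2}/\d{1,2}/\d{4},cd_max:\d{1,2}/\d{1,2}/\d{4}$"

assert re.match(custom_date_pattern, "cdr:1,cd_min:1/1/2024,cd_max:12/31/2024")        # passes validation
assert not re.match(custom_date_pattern, "cdr:1,cd_min:2024-01-01,cd_max:2024-12-31")  # rejected: ISO dates
# Values like "qdr:w" bypass the regex entirely because they are members of valid_tbs_values.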
firecrawl/v2/methods/aio/usage.py
@@ -0,0 +1,89 @@
+from ...utils.http_client_async import AsyncHttpClient
+from ...utils.error_handler import handle_response_error
+from ...types import ConcurrencyCheck, CreditUsage, TokenUsage, CreditUsageHistoricalResponse, TokenUsageHistoricalResponse, QueueStatusResponse
+
+
+async def get_concurrency(client: AsyncHttpClient) -> ConcurrencyCheck:
+    resp = await client.get("/v2/concurrency-check")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get concurrency")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return ConcurrencyCheck(
+        concurrency=data.get("concurrency"),
+        max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
+    )
+
+
+async def get_credit_usage(client: AsyncHttpClient) -> CreditUsage:
+    resp = await client.get("/v2/team/credit-usage")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get credit usage")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return CreditUsage(
+        remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)),
+        plan_credits=data.get("planCredits", data.get("plan_credits")),
+        billing_period_start=data.get("billingPeriodStart", data.get("billing_period_start")),
+        billing_period_end=data.get("billingPeriodEnd", data.get("billing_period_end")),
+    )
+
+
+async def get_token_usage(client: AsyncHttpClient) -> TokenUsage:
+    resp = await client.get("/v2/team/token-usage")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get token usage")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return TokenUsage(
+        remaining_tokens=data.get("remainingTokens", data.get("remaining_tokens", 0)),
+        plan_tokens=data.get("planTokens", data.get("plan_tokens")),
+        billing_period_start=data.get("billingPeriodStart", data.get("billing_period_start")),
+        billing_period_end=data.get("billingPeriodEnd", data.get("billing_period_end")),
+    )
+
+
+async def get_queue_status(client: AsyncHttpClient) -> QueueStatusResponse:
+    resp = await client.get("/v2/team/queue-status")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get queue status")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    data = body.get("data", body)
+    return QueueStatusResponse(
+        jobs_in_queue=data.get("jobsInQueue", 0),
+        active_jobs_in_queue=data.get("activeJobsInQueue", 0),
+        waiting_jobs_in_queue=data.get("waitingJobsInQueue", 0),
+        max_concurrency=data.get("maxConcurrency", 0),
+        most_recent_success=data.get("mostRecentSuccess", None),
+    )
+
+
+async def get_credit_usage_historical(client: AsyncHttpClient, by_api_key: bool = False) -> CreditUsageHistoricalResponse:
+    query = "?byApiKey=true" if by_api_key else ""
+    resp = await client.get(f"/v2/team/credit-usage/historical{query}")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get credit usage historical")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    return CreditUsageHistoricalResponse(**body)
+
+
+async def get_token_usage_historical(client: AsyncHttpClient, by_api_key: bool = False) -> TokenUsageHistoricalResponse:
+    query = "?byApiKey=true" if by_api_key else ""
+    resp = await client.get(f"/v2/team/token-usage/historical{query}")
+    if resp.status_code >= 400:
+        handle_response_error(resp, "get token usage historical")
+    body = resp.json()
+    if not body.get("success"):
+        raise Exception(body.get("error", "Unknown error"))
+    return TokenUsageHistoricalResponse(**body)
+
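All of these usage helpers (the +89 entry firecrawl/v2/methods/aio/usage.py) share one pattern: GET the endpoint, route HTTP errors through handle_response_error, check the top-level success flag, then map camelCase response keys onto snake_case model fields. Because each is an independent coroutine against the same client, they can be fanned out together; a rough sketch, again assuming an already-constructed AsyncHttpClient:

import asyncio
from firecrawl.v2.methods.aio import usage

async def account_snapshot(client):
    # Run the three independent GETs concurrently on the shared async client.
    credits, tokens, queue = await asyncio.gather(
        usage.get_credit_usage(client),
        usage.get_token_usage(client),
        usage.get_queue_status(client),
    )
    return credits.remaining_credits, tokens.remaining_tokens, queue.jobs_in_queue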