firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff shows the content of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.

Note: this release of firecrawl has been flagged as potentially problematic.
Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
firecrawl/v2/methods/scrape.py
@@ -0,0 +1,68 @@
+ """
+ Scraping functionality for Firecrawl v2 API.
+ """
+
+ from typing import Optional, Dict, Any
+ from ..types import ScrapeOptions, Document
+ from ..utils import HttpClient, handle_response_error, prepare_scrape_options, validate_scrape_options
+
+
+ def _prepare_scrape_request(url: str, options: Optional[ScrapeOptions] = None) -> Dict[str, Any]:
+     """
+     Prepare a scrape request payload for v2 API.
+
+     Args:
+         url: URL to scrape
+         options: ScrapeOptions (snake_case) to convert and include
+
+     Returns:
+         Request payload dictionary with camelCase fields
+     """
+     if not url or not url.strip():
+         raise ValueError("URL cannot be empty")
+
+     request_data: Dict[str, Any] = {"url": url.strip()}
+
+     if options is not None:
+         validated = validate_scrape_options(options)
+         if validated is not None:
+             opts = prepare_scrape_options(validated)
+             if opts:
+                 request_data.update(opts)
+
+     return request_data
+
+ def scrape(client: HttpClient, url: str, options: Optional[ScrapeOptions] = None) -> Document:
+     """
+     Scrape a single URL and return the document.
+
+     The v2 API returns: { success: boolean, data: Document }
+     We surface just the Document to callers.
+
+     Args:
+         client: HTTP client instance
+         url: URL to scrape
+         options: Scraping options (snake_case)
+
+     Returns:
+         Document
+     """
+     payload = _prepare_scrape_request(url, options)
+
+     response = client.post("/v2/scrape", payload)
+
+     if not response.ok:
+         handle_response_error(response, "scrape")
+
+     body = response.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error occurred"))
+
+     document_data = body.get("data", {})
+     # Normalize keys for Document (no Pydantic aliases)
+     normalized = dict(document_data)
+     if 'rawHtml' in normalized and 'raw_html' not in normalized:
+         normalized['raw_html'] = normalized.pop('rawHtml')
+     if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+         normalized['change_tracking'] = normalized.pop('changeTracking')
+     return Document(**normalized)
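
For orientation, a minimal sketch of how this module-level scrape function might be driven. The HttpClient constructor arguments and the ScrapeOptions field shown here are assumptions for illustration; the diff does not show their definitions:

    from firecrawl.v2.utils import HttpClient
    from firecrawl.v2.methods.scrape import scrape
    from firecrawl.v2.types import ScrapeOptions

    # Assumed constructor signature -- not shown in this diff.
    client = HttpClient(api_url="https://api.firecrawl.dev", api_key="fc-...")

    # 'formats' is an assumed ScrapeOptions field (snake_case, per the docstring above).
    doc = scrape(client, "https://example.com", ScrapeOptions(formats=["markdown"]))
    print(doc.markdown)  # Document fields like 'markdown' match the keys normalized above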
firecrawl/v2/methods/search.py
@@ -0,0 +1,173 @@
+ """
+ Search functionality for Firecrawl v2 API.
+ """
+
+ from typing import Optional, Dict, Any, Union
+ from ..types import SearchRequest, SearchData, SearchResult, Document
+ from ..utils import HttpClient, handle_response_error, validate_scrape_options, prepare_scrape_options
+
+
+ def search(
+     client: HttpClient,
+     request: SearchRequest
+ ) -> SearchData:
+     """
+     Search for documents.
+
+     Args:
+         client: HTTP client instance
+         request: Search request
+
+     Returns:
+         SearchData with search results grouped by source type
+
+     Raises:
+         FirecrawlError: If the search operation fails
+     """
+     request_data = _prepare_search_request(request)
+
+     response = client.post("/v2/search", request_data)
+
+     if not response.ok:
+         handle_response_error(response, "search")
+
+     response_data = response.json()
+
+     if not response_data.get("success"):
+         # Handle error case
+         error_msg = response_data.get("error", "Unknown error occurred")
+         raise Exception(f"Search failed: {error_msg}")
+
+     data = response_data.get("data", {})
+     search_data = SearchData()
+
+     for source_type, source_documents in data.items():
+         if isinstance(source_documents, list):
+             results = []
+             for doc_data in source_documents:
+                 if isinstance(doc_data, dict):
+                     # If page scraping options were provided, API returns full Document objects
+                     if request.scrape_options is not None and any(
+                         key in doc_data for key in ['markdown', 'html', 'rawHtml', 'links', 'summary', 'screenshot', 'changeTracking']
+                     ):
+                         # Normalize keys for Document (no Pydantic aliases)
+                         normalized = dict(doc_data)
+                         if 'rawHtml' in normalized and 'raw_html' not in normalized:
+                             normalized['raw_html'] = normalized.pop('rawHtml')
+                         if 'changeTracking' in normalized and 'change_tracking' not in normalized:
+                             normalized['change_tracking'] = normalized.pop('changeTracking')
+                         results.append(Document(**normalized))
+                     else:
+                         # Minimal search result shape
+                         results.append(SearchResult(
+                             url=doc_data.get('url', ''),
+                             title=doc_data.get('title'),
+                             description=doc_data.get('description')
+                         ))
+                 elif isinstance(doc_data, str):
+                     results.append(SearchResult(url=doc_data))
+
+             if hasattr(search_data, source_type):
+                 setattr(search_data, source_type, results)
+
+     return search_data
+
+
+ def _validate_search_request(request: SearchRequest) -> SearchRequest:
+     """
+     Validate and normalize search request.
+
+     Args:
+         request: Search request to validate
+
+     Returns:
+         Validated request
+
+     Raises:
+         ValueError: If request is invalid
+     """
+     # Validate query
+     if not request.query or not request.query.strip():
+         raise ValueError("Query cannot be empty")
+
+     # Validate limit
+     if request.limit is not None:
+         if request.limit <= 0:
+             raise ValueError("Limit must be positive")
+         if request.limit > 100:
+             raise ValueError("Limit cannot exceed 100")
+
+     # Validate timeout
+     if request.timeout is not None:
+         if request.timeout <= 0:
+             raise ValueError("Timeout must be positive")
+         if request.timeout > 300000:  # 5 minutes max
+             raise ValueError("Timeout cannot exceed 300000ms (5 minutes)")
+
+     # Validate sources (if provided)
+     if request.sources is not None:
+         valid_sources = {"web", "news", "images"}
+         for source in request.sources:
+             if isinstance(source, str):
+                 if source not in valid_sources:
+                     raise ValueError(f"Invalid source type: {source}. Valid types: {valid_sources}")
+             elif hasattr(source, 'type'):
+                 if source.type not in valid_sources:
+                     raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
+
+     # Validate location (if provided)
+     if request.location is not None:
+         if not isinstance(request.location, str) or len(request.location.strip()) == 0:
+             raise ValueError("Location must be a non-empty string")
+
+     # Validate tbs (time-based search, if provided)
+     if request.tbs is not None:
+         valid_tbs_values = {
+             "qdr:d", "qdr:w", "qdr:m", "qdr:y",  # Google time filters
+             "d", "w", "m", "y"  # Short forms
+         }
+         if request.tbs not in valid_tbs_values:
+             raise ValueError(f"Invalid tbs value: {request.tbs}. Valid values: {valid_tbs_values}")
+
+     # Validate scrape_options (if provided)
+     if request.scrape_options is not None:
+         validate_scrape_options(request.scrape_options)
+
+     return request
+
+
+ def _prepare_search_request(request: SearchRequest) -> Dict[str, Any]:
+     """
+     Prepare a search request payload.
+
+     Args:
+         request: Search request
+
+     Returns:
+         Request payload dictionary
+     """
+     validated_request = _validate_search_request(request)
+     data = validated_request.model_dump(exclude_none=True, by_alias=True)
+
+     # Ensure default values are included only if not explicitly set to None
+     if "limit" not in data and validated_request.limit is not None:
+         data["limit"] = validated_request.limit
+     if "timeout" not in data and validated_request.timeout is not None:
+         data["timeout"] = validated_request.timeout
+
+     # Handle snake_case to camelCase conversions manually
+     # (Pydantic Field() aliases interfere with value assignment)
+
+     # ignore_invalid_urls → ignoreInvalidURLs
+     if validated_request.ignore_invalid_urls is not None:
+         data["ignoreInvalidURLs"] = validated_request.ignore_invalid_urls
+         data.pop("ignore_invalid_urls", None)
+
+     # scrape_options → scrapeOptions
+     if validated_request.scrape_options is not None:
+         scrape_data = prepare_scrape_options(validated_request.scrape_options)
+         if scrape_data:
+             data["scrapeOptions"] = scrape_data
+         data.pop("scrape_options", None)
+
+     return data
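
As a sketch of the payload _prepare_search_request produces, using only field names that appear in the validation code above (default values and the full SearchRequest model are not shown in this diff):

    from firecrawl.v2.types import SearchRequest
    from firecrawl.v2.methods.search import _prepare_search_request

    request = SearchRequest(
        query="firecrawl",
        limit=10,                 # validated: must be 1..100
        tbs="qdr:w",              # past week; one of the allowed values listed above
        sources=["web", "news"],  # validated against {"web", "news", "images"}
        ignore_invalid_urls=True,
    )
    payload = _prepare_search_request(request)
    # payload is camelCased where Pydantic aliases fall short, e.g. it now
    # contains "ignoreInvalidURLs": True rather than "ignore_invalid_urls": True.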
firecrawl/v2/methods/usage.py
@@ -0,0 +1,41 @@
+ from ..utils import HttpClient, handle_response_error
+ from ..types import ConcurrencyCheck, CreditUsage, TokenUsage
+
+
+ def get_concurrency(client: HttpClient) -> ConcurrencyCheck:
+     resp = client.get("/v2/concurrency-check")
+     if not resp.ok:
+         handle_response_error(resp, "get concurrency")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return ConcurrencyCheck(
+         concurrency=data.get("concurrency"),
+         max_concurrency=data.get("maxConcurrency", data.get("max_concurrency")),
+     )
+
+
+ def get_credit_usage(client: HttpClient) -> CreditUsage:
+     resp = client.get("/v2/team/credit-usage")
+     if not resp.ok:
+         handle_response_error(resp, "get credit usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return CreditUsage(remaining_credits=data.get("remainingCredits", data.get("remaining_credits", 0)))
+
+
+ def get_token_usage(client: HttpClient) -> TokenUsage:
+     resp = client.get("/v2/team/token-usage")
+     if not resp.ok:
+         handle_response_error(resp, "get token usage")
+     body = resp.json()
+     if not body.get("success"):
+         raise Exception(body.get("error", "Unknown error"))
+     data = body.get("data", body)
+     return TokenUsage(
+         remaining_tokens=data.get("remainingTokens", 0)
+     )
+
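
Each helper unwraps an optional "data" envelope (body.get("data", body)) and accepts both camelCase and snake_case keys. A self-contained sketch of that defensive parsing, with invented sample payloads:

    def read_max_concurrency(body: dict) -> int:
        # Fall back to the top-level body when there is no "data" envelope.
        data = body.get("data", body)
        # Accept either key casing, as the helpers above do.
        return data.get("maxConcurrency", data.get("max_concurrency"))

    assert read_max_concurrency({"success": True, "data": {"maxConcurrency": 10}}) == 10
    assert read_max_concurrency({"max_concurrency": 10}) == 10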