firecrawl-py 3.2.1__py3-none-any.whl → 3.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

Files changed (86)
  1. build/lib/firecrawl/__init__.py +87 -0
  2. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
  4. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
  8. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
  18. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  36. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. build/lib/firecrawl/client.py +242 -0
  41. build/lib/firecrawl/firecrawl.backup.py +4635 -0
  42. build/lib/firecrawl/types.py +161 -0
  43. build/lib/firecrawl/v1/__init__.py +14 -0
  44. build/lib/firecrawl/v1/client.py +4653 -0
  45. build/lib/firecrawl/v2/__init__.py +4 -0
  46. build/lib/firecrawl/v2/client.py +805 -0
  47. build/lib/firecrawl/v2/client_async.py +250 -0
  48. build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
  49. build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
  50. build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
  51. build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
  52. build/lib/firecrawl/v2/methods/aio/map.py +59 -0
  53. build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
  54. build/lib/firecrawl/v2/methods/aio/search.py +172 -0
  55. build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
  56. build/lib/firecrawl/v2/methods/batch.py +417 -0
  57. build/lib/firecrawl/v2/methods/crawl.py +469 -0
  58. build/lib/firecrawl/v2/methods/extract.py +131 -0
  59. build/lib/firecrawl/v2/methods/map.py +77 -0
  60. build/lib/firecrawl/v2/methods/scrape.py +64 -0
  61. build/lib/firecrawl/v2/methods/search.py +197 -0
  62. build/lib/firecrawl/v2/methods/usage.py +41 -0
  63. build/lib/firecrawl/v2/types.py +665 -0
  64. build/lib/firecrawl/v2/utils/__init__.py +9 -0
  65. build/lib/firecrawl/v2/utils/error_handler.py +107 -0
  66. build/lib/firecrawl/v2/utils/get_version.py +15 -0
  67. build/lib/firecrawl/v2/utils/http_client.py +153 -0
  68. build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
  69. build/lib/firecrawl/v2/utils/normalize.py +107 -0
  70. build/lib/firecrawl/v2/utils/validation.py +324 -0
  71. build/lib/firecrawl/v2/watcher.py +301 -0
  72. build/lib/firecrawl/v2/watcher_async.py +242 -0
  73. build/lib/tests/test_change_tracking.py +98 -0
  74. build/lib/tests/test_timeout_conversion.py +117 -0
  75. firecrawl/__init__.py +1 -1
  76. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
  77. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
  78. firecrawl/v2/client.py +3 -0
  79. firecrawl/v2/methods/search.py +11 -0
  80. firecrawl/v2/types.py +30 -1
  81. {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.1.dist-info}/LICENSE +0 -0
  82. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/METADATA +3 -7
  83. firecrawl_py-3.3.1.dist-info/RECORD +153 -0
  84. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/WHEEL +1 -1
  85. {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/top_level.txt +2 -0
  86. firecrawl_py-3.2.1.dist-info/RECORD +0 -79
build/lib/firecrawl/v2/watcher_async.py ADDED
@@ -0,0 +1,242 @@
+ """
+ Async WebSocket watcher with async iterator interface for v2 jobs.
+
+ Usage:
+     async for snapshot in AsyncWatcher(client, job_id, kind="crawl"):
+         print(snapshot.status)
+ """
+
+ import asyncio
+ import inspect
+ import json
+ import time
+ from typing import AsyncIterator, Dict, List, Literal, Optional
+
+ import websockets
+ from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError
+
+ from .types import BatchScrapeJob, CrawlJob, Document
+ from .utils.normalize import normalize_document_input
+
+ JobKind = Literal["crawl", "batch"]
+
+
+ class AsyncWatcher:
+     def __init__(
+         self,
+         client: object,
+         job_id: str,
+         *,
+         kind: JobKind = "crawl",
+         timeout: Optional[int] = None,
+     ) -> None:
+         self._client = client
+         self._job_id = job_id
+         self._kind = kind
+         self._timeout = timeout
+         self._poll_interval: float = 2.0
+
+         http_client = getattr(client, "http_client", None)
+         if http_client is not None:
+             self._api_url = getattr(http_client, "api_url", None)
+             self._api_key = getattr(http_client, "api_key", None)
+         else:
+             # Allow passing the top-level Firecrawl client directly
+             self._api_url = getattr(client, "api_url", None)
+             self._api_key = getattr(client, "api_key", None)
+
+         self._status: str = "scraping"
+         self._data: List[Dict] = []
+
+     def __aiter__(self) -> AsyncIterator[object]:
+         return self._iterate()
+
+     def _build_ws_url(self) -> str:
+         if not self._api_url:
+             raise ValueError("API URL is required for WebSocket watcher")
+         ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
+         if self._kind == "crawl":
+             return f"{ws_base}/v2/crawl/{self._job_id}"
+         return f"{ws_base}/v2/batch/scrape/{self._job_id}"
+
+     async def _iterate(self) -> AsyncIterator[object]:
+         uri = self._build_ws_url()
+         headers_list = []
+         if self._api_key:
+             headers_list.append(("Authorization", f"Bearer {self._api_key}"))
+
+         # Attempt to establish WS; on failure, fall back to HTTP polling immediately
+         try:
+             async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
+                 deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
+                 # Pre-yield a snapshot if available to ensure progress is visible
+                 try:
+                     pre = await self._fetch_job_status()
+                     yield pre
+                     if pre.status in ("completed", "failed", "cancelled"):
+                         return
+                 except Exception:
+                     pass
+
+                 while True:
+                     try:
+                         if deadline is not None:
+                             remaining = max(0.0, deadline - asyncio.get_event_loop().time())
+                             timeout = min(self._poll_interval, remaining) if remaining > 0 else 0.0
+                         else:
+                             timeout = self._poll_interval
+                         msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                     except asyncio.TimeoutError:
+                         # Quiet period: poll HTTP once
+                         job = await self._safe_fetch()
+                         if job is not None:
+                             yield job
+                             if job.status in ("completed", "failed", "cancelled"):
+                                 return
+                         if deadline is not None and asyncio.get_event_loop().time() >= deadline:
+                             return
+                         continue
+                     except (ConnectionClosedOK, ConnectionClosed, ConnectionClosedError):
+                         # Graceful/abrupt close: poll HTTP until terminal (bounded by timeout)
+                         deadline = time.time() + (self._timeout or 30)
+                         while True:
+                             try:
+                                 job = await self._fetch_job_status()
+                                 yield job
+                                 if job.status in ("completed", "failed", "cancelled"):
+                                     return
+                             except Exception:
+                                 return
+                             if time.time() >= deadline:
+                                 return
+                             await asyncio.sleep(1)
+                     try:
+                         body = json.loads(msg)
+                     except Exception:
+                         continue
+
+                     msg_type = body.get("type")
+                     if msg_type == "error":
+                         self._status = "failed"
+                         # Yield a terminal snapshot
+                         if self._kind == "crawl":
+                             yield CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                         else:
+                             yield BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                         return
+                     elif msg_type == "catchup":
+                         d = body.get("data", {})
+                         self._status = d.get("status", self._status)
+                         docs_in = d.get("data", []) or []
+                         self._data.extend(docs_in)
+                         # Fall through to emit a snapshot below
+                     elif msg_type == "document":
+                         doc = body.get("data")
+                         if isinstance(doc, dict):
+                             self._data.append(doc)
+                         # Fall through to emit a snapshot below
+                     elif msg_type == "done":
+                         self._status = "completed"
+                         raw_payload = body.get("data", {}) or {}
+                         docs_in = raw_payload.get("data", []) or []
+                         if isinstance(docs_in, list) and docs_in:
+                             for doc in docs_in:
+                                 if isinstance(doc, dict):
+                                     self._data.append(doc)
+                         # Emit final snapshot then end
+                         yield self._make_snapshot(status="completed", payload=raw_payload, docs_override=self._data)
+                         return
+
+                     # Generic snapshot emit for status messages and periodic progress
+                     payload = body.get("data", body)
+                     status_str = payload.get("status", body.get("status", self._status))
+                     snapshot = self._make_snapshot(status=status_str, payload=payload)
+                     yield snapshot
+                     if status_str in ("completed", "failed", "cancelled"):
+                         return
+         except Exception:
+             # WS connect failure: fallback to HTTP polling loop until terminal/timeout
+             deadline = time.time() + (self._timeout or 30)
+             while True:
+                 try:
+                     job = await self._fetch_job_status()
+                     yield job
+                     if job.status in ("completed", "failed", "cancelled"):
+                         return
+                 except Exception:
+                     return
+                 if time.time() >= deadline:
+                     return
+                 await asyncio.sleep(1)
+
+     async def _fetch_job_status(self):
+         if self._kind == "crawl":
+             return await self._call_status_method("get_crawl_status")
+         return await self._call_status_method("get_batch_scrape_status")
+
+     async def _call_status_method(self, method_name: str):
+         # Try on client directly
+         meth = getattr(self._client, method_name, None)
+         if meth is not None:
+             try:
+                 result = meth(self._job_id)
+             except TypeError:
+                 result = None
+             if result is not None:
+                 if inspect.isawaitable(result):
+                     return await result
+                 return result
+             # Fallback: if we couldn't call directly, try to_thread
+             return await asyncio.to_thread(meth, self._job_id)
+
+         # Try on client.v2
+         v2 = getattr(self._client, "v2", None)
+         if v2 is not None:
+             meth = getattr(v2, method_name, None)
+             if meth is not None:
+                 try:
+                     result = meth(self._job_id)
+                 except TypeError:
+                     result = None
+                 if result is not None:
+                     if inspect.isawaitable(result):
+                         return await result
+                     return result
+                 return await asyncio.to_thread(meth, self._job_id)
+
+         raise RuntimeError(f"Client does not expose {method_name}")
+
+     async def _safe_fetch(self):
+         try:
+             return await self._fetch_job_status()
+         except Exception:
+             return None
+
+     def _make_snapshot(self, *, status: str, payload: Dict, docs_override: Optional[List[Dict]] = None):
+         docs = []
+         source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
+         for doc in source_docs:
+             if isinstance(doc, dict):
+                 d = normalize_document_input(doc)
+                 docs.append(Document(**d))
+
+         if self._kind == "crawl":
+             return CrawlJob(
+                 status=status,
+                 completed=payload.get("completed", 0),
+                 total=payload.get("total", 0),
+                 credits_used=payload.get("creditsUsed", 0),
+                 expires_at=payload.get("expiresAt"),
+                 next=payload.get("next"),
+                 data=docs,
+             )
+         return BatchScrapeJob(
+             status=status,
+             completed=payload.get("completed", 0),
+             total=payload.get("total", 0),
+             credits_used=payload.get("creditsUsed"),
+             expires_at=payload.get("expiresAt"),
+             next=payload.get("next"),
+             data=docs,
+         )
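For context, a minimal consumption sketch for the AsyncWatcher added above (not part of this diff); it assumes a configured v2 client object that exposes get_crawl_status and an already-started crawl job id:

import asyncio

from firecrawl.v2.watcher_async import AsyncWatcher

async def watch_crawl(client, job_id: str) -> None:
    # Each snapshot is a CrawlJob built from WebSocket messages, with HTTP polling as a fallback.
    async for snapshot in AsyncWatcher(client, job_id, kind="crawl", timeout=120):
        print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")
        if snapshot.status in ("completed", "failed", "cancelled"):
            break

# asyncio.run(watch_crawl(client, job_id))  # requires a real client and job id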
build/lib/tests/test_change_tracking.py ADDED
@@ -0,0 +1,98 @@
+ import unittest
+ from unittest.mock import patch, MagicMock
+ import json
+ import os
+ from firecrawl import FirecrawlApp
+
+ class TestChangeTracking(unittest.TestCase):
+     @patch('requests.post')
+     def test_change_tracking_format(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             'success': True,
+             'data': {
+                 'markdown': 'Test markdown content',
+                 'changeTracking': {
+                     'previousScrapeAt': '2023-01-01T00:00:00Z',
+                     'changeStatus': 'changed',
+                     'visibility': 'visible'
+                 }
+             }
+         }
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+         result = app.scrape_url('https://example.com', {
+             'formats': ['markdown', 'changeTracking']
+         })
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])
+
+         self.assertEqual(result['changeTracking']['previousScrapeAt'], '2023-01-01T00:00:00Z')
+         self.assertEqual(result['changeTracking']['changeStatus'], 'changed')
+         self.assertEqual(result['changeTracking']['visibility'], 'visible')
+
+     @patch('requests.post')
+     def test_change_tracking_options(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             'success': True,
+             'data': {
+                 'markdown': 'Test markdown content',
+                 'changeTracking': {
+                     'previousScrapeAt': '2023-01-01T00:00:00Z',
+                     'changeStatus': 'changed',
+                     'visibility': 'visible',
+                     'diff': {
+                         'text': '@@ -1,1 +1,1 @@\n-old content\n+new content',
+                         'json': {
+                             'files': [{
+                                 'from': None,
+                                 'to': None,
+                                 'chunks': [{
+                                     'content': '@@ -1,1 +1,1 @@',
+                                     'changes': [{
+                                         'type': 'del',
+                                         'content': '-old content',
+                                         'del': True,
+                                         'ln': 1
+                                     }, {
+                                         'type': 'add',
+                                         'content': '+new content',
+                                         'add': True,
+                                         'ln': 1
+                                     }]
+                                 }]
+                             }]
+                         }
+                     },
+                     'json': {
+                         'title': {
+                             'previous': 'Old Title',
+                             'current': 'New Title'
+                         }
+                     }
+                 }
+             }
+         }
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+         result = app.scrape_url('https://example.com', {
+             'formats': ['markdown', 'changeTracking'],
+             'changeTrackingOptions': {
+                 'modes': ['git-diff', 'json'],
+                 'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}}}
+             }
+         })
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])
+         self.assertEqual(kwargs['json']['changeTrackingOptions']['modes'], ['git-diff', 'json'])
+
+         self.assertEqual(result['changeTracking']['diff']['text'], '@@ -1,1 +1,1 @@\n-old content\n+new content')
+         self.assertEqual(result['changeTracking']['json']['title']['previous'], 'Old Title')
+         self.assertEqual(result['changeTracking']['json']['title']['current'], 'New Title')
build/lib/tests/test_timeout_conversion.py ADDED
@@ -0,0 +1,117 @@
+ import unittest
+ from unittest.mock import patch, MagicMock
+ import os
+ from firecrawl import FirecrawlApp
+
+
+ class TestTimeoutConversion(unittest.TestCase):
+
+     @patch('requests.post')
+     def test_scrape_url_timeout_conversion(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             'success': True,
+             'data': {
+                 'markdown': 'Test content'
+             }
+         }
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+         app.scrape_url('https://example.com', timeout=60000)
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 65.0)
+
+     @patch('requests.post')
+     def test_scrape_url_default_timeout(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             'success': True,
+             'data': {
+                 'markdown': 'Test content'
+             }
+         }
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+         app.scrape_url('https://example.com')
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 35.0)
+
+     @patch('requests.post')
+     def test_post_request_timeout_conversion(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+
+         data = {'timeout': 30000}
+         headers = {'Content-Type': 'application/json'}
+
+         app._post_request('https://example.com/api', data, headers)
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 35.0)
+
+     @patch('requests.post')
+     def test_post_request_default_timeout(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+
+         data = {'timeout': 30000, 'url': 'https://example.com'}
+         headers = {'Content-Type': 'application/json'}
+
+         app._post_request('https://example.com/api', data, headers)
+
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 35.0)
+
+     @patch('requests.post')
+     def test_timeout_edge_cases(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_response.json.return_value = {
+             'success': True,
+             'data': {
+                 'markdown': 'Test content'
+             }
+         }
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+
+         app.scrape_url('https://example.com', timeout=1000)
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 6.0)
+
+         app.scrape_url('https://example.com', timeout=0)
+         args, kwargs = mock_post.call_args
+         self.assertEqual(kwargs['timeout'], 5.0)
+
+     @patch('requests.post')
+     def test_post_request_no_timeout_key(self, mock_post):
+         mock_response = MagicMock()
+         mock_response.status_code = 200
+         mock_post.return_value = mock_response
+
+         app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
+
+         data = {'url': 'https://example.com'}
+         headers = {'Content-Type': 'application/json'}
+
+         app._post_request('https://example.com/api', data, headers)
+
+         args, kwargs = mock_post.call_args
+         self.assertIsNone(kwargs['timeout'])
+
+
+ if __name__ == '__main__':
+     unittest.main()
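Read together, these tests pin down the millisecond-to-second conversion the SDK appears to apply before calling requests.post. A hedged sketch of that rule (the helper name below is hypothetical, not from the package):

def _http_timeout(timeout_ms):
    # Request timeout passed to requests.post: milliseconds / 1000 plus a 5-second buffer;
    # when no 'timeout' key is present at all, no HTTP timeout is set.
    if timeout_ms is None:
        return None
    return timeout_ms / 1000.0 + 5.0

assert _http_timeout(60000) == 65.0   # test_scrape_url_timeout_conversion
assert _http_timeout(30000) == 35.0   # the 30000 ms value used in the tests above
assert _http_timeout(1000) == 6.0     # edge case
assert _http_timeout(0) == 5.0        # edge case
assert _http_timeout(None) is None    # test_post_request_no_timeout_key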
firecrawl/__init__.py CHANGED
@@ -17,7 +17,7 @@ from .v1 import (
  V1ChangeTrackingOptions,
  )
 
- __version__ = "3.2.1"
+ __version__ = "3.3.1"
 
  # Define the logger for the Firecrawl project
  logger: logging.Logger = logging.getLogger("firecrawl")
firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py CHANGED
@@ -14,7 +14,7 @@ class TestAsyncCrawlRequestPreparation:
  include_paths=["/docs/*"],
  exclude_paths=["/admin/*"],
  max_discovery_depth=2,
- ignore_sitemap=True,
+ sitemap="skip",
  ignore_query_parameters=True,
  crawl_entire_domain=True,
  allow_external_links=False,
@@ -26,7 +26,7 @@ class TestAsyncCrawlRequestPreparation:
  assert payload["includePaths"] == ["/docs/*"]
  assert payload["excludePaths"] == ["/admin/*"]
  assert payload["maxDiscoveryDepth"] == 2
- assert payload["ignoreSitemap"] is True
+ assert payload["sitemap"] == "skip"
  assert payload["ignoreQueryParameters"] is True
  assert payload["crawlEntireDomain"] is True
  assert payload["allowExternalLinks"] is False
firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py CHANGED
@@ -24,7 +24,7 @@ class TestCrawlRequestPreparation:
  url="https://example.com",
  limit=10,
  max_discovery_depth=3,
- ignore_sitemap=True,
+ sitemap="skip",
  crawl_entire_domain=False,
  allow_external_links=True
  )
@@ -39,8 +39,8 @@ class TestCrawlRequestPreparation:
  assert data["limit"] == 10
  assert "maxDiscoveryDepth" in data
  assert data["maxDiscoveryDepth"] == 3
- assert "ignoreSitemap" in data
- assert data["ignoreSitemap"] is True
+ assert "sitemap" in data
+ assert data["sitemap"] == "skip"
  assert "crawlEntireDomain" in data
  assert data["crawlEntireDomain"] is False
  assert "allowExternalLinks" in data
@@ -106,7 +106,7 @@ class TestCrawlRequestPreparation:
  include_paths=["/blog/*", "/docs/*"],
  exclude_paths=["/admin/*"],
  max_discovery_depth=3,
- ignore_sitemap=False,
+ sitemap="include",
  limit=100,
  crawl_entire_domain=True,
  allow_external_links=False,
@@ -126,8 +126,8 @@ class TestCrawlRequestPreparation:
  assert data["excludePaths"] == ["/admin/*"]
  assert "maxDiscoveryDepth" in data
  assert data["maxDiscoveryDepth"] == 3
- assert "ignoreSitemap" in data
- assert data["ignoreSitemap"] is False
+ assert "sitemap" in data
+ assert data["sitemap"] == "include"
  assert "limit" in data
  assert data["limit"] == 100
  assert "crawlEntireDomain" in data
firecrawl/v2/client.py CHANGED
@@ -13,6 +13,7 @@ from .types import (
  SearchRequest,
  SearchData,
  SourceOption,
+ CategoryOption,
  CrawlRequest,
  CrawlResponse,
  CrawlJob,
@@ -171,6 +172,7 @@ class FirecrawlClient:
  query: str,
  *,
  sources: Optional[List[SourceOption]] = None,
+ categories: Optional[List[CategoryOption]] = None,
  limit: Optional[int] = None,
  tbs: Optional[str] = None,
  location: Optional[str] = None,
@@ -195,6 +197,7 @@ class FirecrawlClient:
  request = SearchRequest(
  query=query,
  sources=sources,
+ categories=categories,
  limit=limit,
  tbs=tbs,
  location=location,
firecrawl/v2/methods/search.py CHANGED
@@ -121,6 +121,17 @@ def _validate_search_request(request: SearchRequest) -> SearchRequest:
  if source.type not in valid_sources:
  raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
 
+ # Validate categories (if provided)
+ if request.categories is not None:
+     valid_categories = {"github", "research"}
+     for category in request.categories:
+         if isinstance(category, str):
+             if category not in valid_categories:
+                 raise ValueError(f"Invalid category type: {category}. Valid types: {valid_categories}")
+         elif hasattr(category, 'type'):
+             if category.type not in valid_categories:
+                 raise ValueError(f"Invalid category type: {category.type}. Valid types: {valid_categories}")
+
  # Validate location (if provided)
  if request.location is not None:
  if not isinstance(request.location, str) or len(request.location.strip()) == 0:
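Taken with the client.py change above, the new categories filter is exposed through FirecrawlClient.search(), and only "github" and "research" pass this validator. A hedged usage sketch (the client construction arguments are an assumption, not shown in this diff):

from firecrawl.v2.client import FirecrawlClient

client = FirecrawlClient(api_key="fc-...")
results = client.search(
    "vector database benchmarks",
    categories=["github", "research"],  # plain strings; SearchRequest normalizes them to Category objects
    limit=5,
)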
firecrawl/v2/types.py CHANGED
@@ -174,6 +174,12 @@ class Source(BaseModel):
 
  SourceOption = Union[str, Source]
 
+ class Category(BaseModel):
+     """Configuration for a search category."""
+     type: str
+
+ CategoryOption = Union[str, Category]
+
  FormatString = Literal[
  # camelCase versions (API format)
  "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
@@ -331,7 +337,8 @@ class SearchResultWeb(BaseModel):
  """A web search result with URL, title, and description."""
  url: str
  title: Optional[str] = None
- description: Optional[str] = None
+ description: Optional[str] = None
+ category: Optional[str] = None
 
  class SearchResultNews(BaseModel):
  """A news search result with URL, title, snippet, date, image URL, and position."""
@@ -341,6 +348,7 @@ class SearchResultNews(BaseModel):
  date: Optional[str] = None
  image_url: Optional[str] = None
  position: Optional[int] = None
+ category: Optional[str] = None
 
  class SearchResultImages(BaseModel):
  """An image search result with URL, title, image URL, image width, image height, and position."""
@@ -521,6 +529,7 @@ class SearchRequest(BaseModel):
  """Request for search operations."""
  query: str
  sources: Optional[List[SourceOption]] = None
+ categories: Optional[List[CategoryOption]] = None
  limit: Optional[int] = 5
  tbs: Optional[str] = None
  location: Optional[str] = None
@@ -547,6 +556,26 @@ class SearchRequest(BaseModel):
  raise ValueError(f"Invalid source format: {source}")
 
  return normalized_sources
+
+ @field_validator('categories')
+ @classmethod
+ def validate_categories(cls, v):
+     """Validate and normalize categories input."""
+     if v is None:
+         return v
+
+     normalized_categories = []
+     for category in v:
+         if isinstance(category, str):
+             normalized_categories.append(Category(type=category))
+         elif isinstance(category, dict):
+             normalized_categories.append(Category(**category))
+         elif isinstance(category, Category):
+             normalized_categories.append(category)
+         else:
+             raise ValueError(f"Invalid category format: {category}")
+
+     return normalized_categories
 
  class LinkResult(BaseModel):
  """A generic link result with optional metadata (used by search and map)."""
{firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
- Metadata-Version: 2.4
+ Metadata-Version: 2.1
  Name: firecrawl-py
- Version: 3.2.1
+ Version: 3.3.1
  Summary: Python SDK for Firecrawl API
  Home-page: https://github.com/firecrawl/firecrawl
  Author: Mendable.ai
@@ -38,12 +38,8 @@ Requires-Dist: httpx
  Requires-Dist: python-dotenv
  Requires-Dist: websockets
  Requires-Dist: nest-asyncio
- Requires-Dist: pydantic>=2.0
+ Requires-Dist: pydantic (>=2.0)
  Requires-Dist: aiohttp
- Dynamic: author
- Dynamic: home-page
- Dynamic: license-file
- Dynamic: requires-python
 
  # Firecrawl Python SDK