firecrawl-py 3.2.1__py3-none-any.whl → 3.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of firecrawl-py might be problematic.
- build/lib/firecrawl/__init__.py +87 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +188 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +248 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- build/lib/firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_map.py +60 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_search.py +269 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
- build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- build/lib/firecrawl/client.py +242 -0
- build/lib/firecrawl/firecrawl.backup.py +4635 -0
- build/lib/firecrawl/types.py +161 -0
- build/lib/firecrawl/v1/__init__.py +14 -0
- build/lib/firecrawl/v1/client.py +4653 -0
- build/lib/firecrawl/v2/__init__.py +4 -0
- build/lib/firecrawl/v2/client.py +805 -0
- build/lib/firecrawl/v2/client_async.py +250 -0
- build/lib/firecrawl/v2/methods/aio/__init__.py +1 -0
- build/lib/firecrawl/v2/methods/aio/batch.py +85 -0
- build/lib/firecrawl/v2/methods/aio/crawl.py +171 -0
- build/lib/firecrawl/v2/methods/aio/extract.py +126 -0
- build/lib/firecrawl/v2/methods/aio/map.py +59 -0
- build/lib/firecrawl/v2/methods/aio/scrape.py +33 -0
- build/lib/firecrawl/v2/methods/aio/search.py +172 -0
- build/lib/firecrawl/v2/methods/aio/usage.py +42 -0
- build/lib/firecrawl/v2/methods/batch.py +417 -0
- build/lib/firecrawl/v2/methods/crawl.py +469 -0
- build/lib/firecrawl/v2/methods/extract.py +131 -0
- build/lib/firecrawl/v2/methods/map.py +77 -0
- build/lib/firecrawl/v2/methods/scrape.py +64 -0
- build/lib/firecrawl/v2/methods/search.py +197 -0
- build/lib/firecrawl/v2/methods/usage.py +41 -0
- build/lib/firecrawl/v2/types.py +665 -0
- build/lib/firecrawl/v2/utils/__init__.py +9 -0
- build/lib/firecrawl/v2/utils/error_handler.py +107 -0
- build/lib/firecrawl/v2/utils/get_version.py +15 -0
- build/lib/firecrawl/v2/utils/http_client.py +153 -0
- build/lib/firecrawl/v2/utils/http_client_async.py +65 -0
- build/lib/firecrawl/v2/utils/normalize.py +107 -0
- build/lib/firecrawl/v2/utils/validation.py +324 -0
- build/lib/firecrawl/v2/watcher.py +301 -0
- build/lib/firecrawl/v2/watcher_async.py +242 -0
- build/lib/tests/test_change_tracking.py +98 -0
- build/lib/tests/test_timeout_conversion.py +117 -0
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +2 -2
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +6 -6
- firecrawl/v2/client.py +3 -0
- firecrawl/v2/methods/search.py +11 -0
- firecrawl/v2/types.py +30 -1
- {firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.1.dist-info}/LICENSE +0 -0
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/METADATA +3 -7
- firecrawl_py-3.3.1.dist-info/RECORD +153 -0
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/WHEEL +1 -1
- {firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/top_level.txt +2 -0
- firecrawl_py-3.2.1.dist-info/RECORD +0 -79
build/lib/firecrawl/v2/watcher_async.py
ADDED

@@ -0,0 +1,242 @@
"""
Async WebSocket watcher with async iterator interface for v2 jobs.

Usage:
    async for snapshot in AsyncWatcher(client, job_id, kind="crawl"):
        print(snapshot.status)
"""

import asyncio
import inspect
import json
import time
from typing import AsyncIterator, Dict, List, Literal, Optional

import websockets
from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError

from .types import BatchScrapeJob, CrawlJob, Document
from .utils.normalize import normalize_document_input

JobKind = Literal["crawl", "batch"]


class AsyncWatcher:
    def __init__(
        self,
        client: object,
        job_id: str,
        *,
        kind: JobKind = "crawl",
        timeout: Optional[int] = None,
    ) -> None:
        self._client = client
        self._job_id = job_id
        self._kind = kind
        self._timeout = timeout
        self._poll_interval: float = 2.0

        http_client = getattr(client, "http_client", None)
        if http_client is not None:
            self._api_url = getattr(http_client, "api_url", None)
            self._api_key = getattr(http_client, "api_key", None)
        else:
            # Allow passing the top-level Firecrawl client directly
            self._api_url = getattr(client, "api_url", None)
            self._api_key = getattr(client, "api_key", None)

        self._status: str = "scraping"
        self._data: List[Dict] = []

    def __aiter__(self) -> AsyncIterator[object]:
        return self._iterate()

    def _build_ws_url(self) -> str:
        if not self._api_url:
            raise ValueError("API URL is required for WebSocket watcher")
        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
        if self._kind == "crawl":
            return f"{ws_base}/v2/crawl/{self._job_id}"
        return f"{ws_base}/v2/batch/scrape/{self._job_id}"

    async def _iterate(self) -> AsyncIterator[object]:
        uri = self._build_ws_url()
        headers_list = []
        if self._api_key:
            headers_list.append(("Authorization", f"Bearer {self._api_key}"))

        # Attempt to establish WS; on failure, fall back to HTTP polling immediately
        try:
            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
                # Pre-yield a snapshot if available to ensure progress is visible
                try:
                    pre = await self._fetch_job_status()
                    yield pre
                    if pre.status in ("completed", "failed", "cancelled"):
                        return
                except Exception:
                    pass

                while True:
                    try:
                        if deadline is not None:
                            remaining = max(0.0, deadline - asyncio.get_event_loop().time())
                            timeout = min(self._poll_interval, remaining) if remaining > 0 else 0.0
                        else:
                            timeout = self._poll_interval
                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
                    except asyncio.TimeoutError:
                        # Quiet period: poll HTTP once
                        job = await self._safe_fetch()
                        if job is not None:
                            yield job
                            if job.status in ("completed", "failed", "cancelled"):
                                return
                        if deadline is not None and asyncio.get_event_loop().time() >= deadline:
                            return
                        continue
                    except (ConnectionClosedOK, ConnectionClosed, ConnectionClosedError):
                        # Graceful/abrupt close: poll HTTP until terminal (bounded by timeout)
                        deadline = time.time() + (self._timeout or 30)
                        while True:
                            try:
                                job = await self._fetch_job_status()
                                yield job
                                if job.status in ("completed", "failed", "cancelled"):
                                    return
                            except Exception:
                                return
                            if time.time() >= deadline:
                                return
                            await asyncio.sleep(1)
                    try:
                        body = json.loads(msg)
                    except Exception:
                        continue

                    msg_type = body.get("type")
                    if msg_type == "error":
                        self._status = "failed"
                        # Yield a terminal snapshot
                        if self._kind == "crawl":
                            yield CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        else:
                            yield BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        return
                    elif msg_type == "catchup":
                        d = body.get("data", {})
                        self._status = d.get("status", self._status)
                        docs_in = d.get("data", []) or []
                        self._data.extend(docs_in)
                        # Fall through to emit a snapshot below
                    elif msg_type == "document":
                        doc = body.get("data")
                        if isinstance(doc, dict):
                            self._data.append(doc)
                        # Fall through to emit a snapshot below
                    elif msg_type == "done":
                        self._status = "completed"
                        raw_payload = body.get("data", {}) or {}
                        docs_in = raw_payload.get("data", []) or []
                        if isinstance(docs_in, list) and docs_in:
                            for doc in docs_in:
                                if isinstance(doc, dict):
                                    self._data.append(doc)
                        # Emit final snapshot then end
                        yield self._make_snapshot(status="completed", payload=raw_payload, docs_override=self._data)
                        return

                    # Generic snapshot emit for status messages and periodic progress
                    payload = body.get("data", body)
                    status_str = payload.get("status", body.get("status", self._status))
                    snapshot = self._make_snapshot(status=status_str, payload=payload)
                    yield snapshot
                    if status_str in ("completed", "failed", "cancelled"):
                        return
        except Exception:
            # WS connect failure: fallback to HTTP polling loop until terminal/timeout
            deadline = time.time() + (self._timeout or 30)
            while True:
                try:
                    job = await self._fetch_job_status()
                    yield job
                    if job.status in ("completed", "failed", "cancelled"):
                        return
                except Exception:
                    return
                if time.time() >= deadline:
                    return
                await asyncio.sleep(1)

    async def _fetch_job_status(self):
        if self._kind == "crawl":
            return await self._call_status_method("get_crawl_status")
        return await self._call_status_method("get_batch_scrape_status")

    async def _call_status_method(self, method_name: str):
        # Try on client directly
        meth = getattr(self._client, method_name, None)
        if meth is not None:
            try:
                result = meth(self._job_id)
            except TypeError:
                result = None
            if result is not None:
                if inspect.isawaitable(result):
                    return await result
                return result
            # Fallback: if we couldn't call directly, try to_thread
            return await asyncio.to_thread(meth, self._job_id)

        # Try on client.v2
        v2 = getattr(self._client, "v2", None)
        if v2 is not None:
            meth = getattr(v2, method_name, None)
            if meth is not None:
                try:
                    result = meth(self._job_id)
                except TypeError:
                    result = None
                if result is not None:
                    if inspect.isawaitable(result):
                        return await result
                    return result
                return await asyncio.to_thread(meth, self._job_id)

        raise RuntimeError(f"Client does not expose {method_name}")

    async def _safe_fetch(self):
        try:
            return await self._fetch_job_status()
        except Exception:
            return None

    def _make_snapshot(self, *, status: str, payload: Dict, docs_override: Optional[List[Dict]] = None):
        docs = []
        source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
        for doc in source_docs:
            if isinstance(doc, dict):
                d = normalize_document_input(doc)
                docs.append(Document(**d))

        if self._kind == "crawl":
            return CrawlJob(
                status=status,
                completed=payload.get("completed", 0),
                total=payload.get("total", 0),
                credits_used=payload.get("creditsUsed", 0),
                expires_at=payload.get("expiresAt"),
                next=payload.get("next"),
                data=docs,
            )
        return BatchScrapeJob(
            status=status,
            completed=payload.get("completed", 0),
            total=payload.get("total", 0),
            credits_used=payload.get("creditsUsed"),
            expires_at=payload.get("expiresAt"),
            next=payload.get("next"),
            data=docs,
        )
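For context, a minimal sketch of driving this watcher from the async v2 client. The AsyncFirecrawl entry point and the start_crawl call are assumptions about the SDK's public API; only AsyncWatcher(client, job_id, kind=..., timeout=...) and the snapshot fields come from the file above.

import asyncio

from firecrawl import AsyncFirecrawl  # assumed async v2 client entry point
from firecrawl.v2.watcher_async import AsyncWatcher

async def main() -> None:
    client = AsyncFirecrawl(api_key="fc-YOUR-KEY")  # hypothetical key
    started = await client.start_crawl("https://example.com", limit=5)  # assumed starter returning a job id

    # Snapshots arrive over WebSocket; the watcher falls back to HTTP polling
    # when the socket is quiet, closes, or cannot be opened, and stops on a
    # terminal status by itself.
    async for snapshot in AsyncWatcher(client, started.id, kind="crawl", timeout=120):
        print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")

asyncio.run(main())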
build/lib/tests/test_change_tracking.py
ADDED

@@ -0,0 +1,98 @@
import unittest
from unittest.mock import patch, MagicMock
import json
import os
from firecrawl import FirecrawlApp

class TestChangeTracking(unittest.TestCase):
    @patch('requests.post')
    def test_change_tracking_format(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'success': True,
            'data': {
                'markdown': 'Test markdown content',
                'changeTracking': {
                    'previousScrapeAt': '2023-01-01T00:00:00Z',
                    'changeStatus': 'changed',
                    'visibility': 'visible'
                }
            }
        }
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
        result = app.scrape_url('https://example.com', {
            'formats': ['markdown', 'changeTracking']
        })

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])

        self.assertEqual(result['changeTracking']['previousScrapeAt'], '2023-01-01T00:00:00Z')
        self.assertEqual(result['changeTracking']['changeStatus'], 'changed')
        self.assertEqual(result['changeTracking']['visibility'], 'visible')

    @patch('requests.post')
    def test_change_tracking_options(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'success': True,
            'data': {
                'markdown': 'Test markdown content',
                'changeTracking': {
                    'previousScrapeAt': '2023-01-01T00:00:00Z',
                    'changeStatus': 'changed',
                    'visibility': 'visible',
                    'diff': {
                        'text': '@@ -1,1 +1,1 @@\n-old content\n+new content',
                        'json': {
                            'files': [{
                                'from': None,
                                'to': None,
                                'chunks': [{
                                    'content': '@@ -1,1 +1,1 @@',
                                    'changes': [{
                                        'type': 'del',
                                        'content': '-old content',
                                        'del': True,
                                        'ln': 1
                                    }, {
                                        'type': 'add',
                                        'content': '+new content',
                                        'add': True,
                                        'ln': 1
                                    }]
                                }]
                            }]
                        }
                    },
                    'json': {
                        'title': {
                            'previous': 'Old Title',
                            'current': 'New Title'
                        }
                    }
                }
            }
        }
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
        result = app.scrape_url('https://example.com', {
            'formats': ['markdown', 'changeTracking'],
            'changeTrackingOptions': {
                'modes': ['git-diff', 'json'],
                'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}}}
            }
        })

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['json']['formats'], ['markdown', 'changeTracking'])
        self.assertEqual(kwargs['json']['changeTrackingOptions']['modes'], ['git-diff', 'json'])

        self.assertEqual(result['changeTracking']['diff']['text'], '@@ -1,1 +1,1 @@\n-old content\n+new content')
        self.assertEqual(result['changeTracking']['json']['title']['previous'], 'Old Title')
        self.assertEqual(result['changeTracking']['json']['title']['current'], 'New Title')
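The two tests above run entirely against a mocked requests.post. As a usage sketch, the same call shape against a live key would look like the following; the request mirrors the test payload exactly, only the API key source is swapped.

import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# Same request shape as test_change_tracking_options above.
result = app.scrape_url('https://example.com', {
    'formats': ['markdown', 'changeTracking'],
    'changeTrackingOptions': {
        'modes': ['git-diff', 'json'],
        'schema': {'type': 'object', 'properties': {'title': {'type': 'string'}}}
    }
})

# Fields mirror the mocked response asserted in the tests; live values depend on the page.
print(result['changeTracking']['changeStatus'])
print(result['changeTracking'].get('diff', {}).get('text'))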
build/lib/tests/test_timeout_conversion.py
ADDED

@@ -0,0 +1,117 @@
import unittest
from unittest.mock import patch, MagicMock
import os
from firecrawl import FirecrawlApp


class TestTimeoutConversion(unittest.TestCase):

    @patch('requests.post')
    def test_scrape_url_timeout_conversion(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'success': True,
            'data': {
                'markdown': 'Test content'
            }
        }
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
        app.scrape_url('https://example.com', timeout=60000)

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 65.0)

    @patch('requests.post')
    def test_scrape_url_default_timeout(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'success': True,
            'data': {
                'markdown': 'Test content'
            }
        }
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))
        app.scrape_url('https://example.com')

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 35.0)

    @patch('requests.post')
    def test_post_request_timeout_conversion(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))

        data = {'timeout': 30000}
        headers = {'Content-Type': 'application/json'}

        app._post_request('https://example.com/api', data, headers)

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 35.0)

    @patch('requests.post')
    def test_post_request_default_timeout(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))

        data = {'timeout': 30000, 'url': 'https://example.com'}
        headers = {'Content-Type': 'application/json'}

        app._post_request('https://example.com/api', data, headers)

        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 35.0)

    @patch('requests.post')
    def test_timeout_edge_cases(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_response.json.return_value = {
            'success': True,
            'data': {
                'markdown': 'Test content'
            }
        }
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))

        app.scrape_url('https://example.com', timeout=1000)
        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 6.0)

        app.scrape_url('https://example.com', timeout=0)
        args, kwargs = mock_post.call_args
        self.assertEqual(kwargs['timeout'], 5.0)

    @patch('requests.post')
    def test_post_request_no_timeout_key(self, mock_post):
        mock_response = MagicMock()
        mock_response.status_code = 200
        mock_post.return_value = mock_response

        app = FirecrawlApp(api_key=os.environ.get('TEST_API_KEY', 'dummy-api-key-for-testing'))

        data = {'url': 'https://example.com'}
        headers = {'Content-Type': 'application/json'}

        app._post_request('https://example.com/api', data, headers)

        args, kwargs = mock_post.call_args
        self.assertIsNone(kwargs['timeout'])


if __name__ == '__main__':
    unittest.main()
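Taken together, the assertions above imply that the SDK treats timeout as milliseconds and hands requests a seconds value with a five-second buffer: 60000 ms yields 65.0 s, the 30000 ms default yields 35.0 s, 0 yields 5.0 s, and None is passed when no timeout key is present at all. A minimal sketch of that implied conversion; to_requests_timeout is a hypothetical name, not the SDK's actual helper.

from typing import Optional

def to_requests_timeout(timeout_ms: Optional[int]) -> Optional[float]:
    """Millisecond timeout -> requests timeout in seconds plus a 5 s buffer (inferred from the tests)."""
    if timeout_ms is None:
        return None
    return timeout_ms / 1000 + 5.0

assert to_requests_timeout(60000) == 65.0
assert to_requests_timeout(30000) == 35.0
assert to_requests_timeout(1000) == 6.0
assert to_requests_timeout(0) == 5.0
assert to_requests_timeout(None) is None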
firecrawl/__init__.py
CHANGED

firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py
CHANGED

@@ -14,7 +14,7 @@ class TestAsyncCrawlRequestPreparation:
             include_paths=["/docs/*"],
             exclude_paths=["/admin/*"],
             max_discovery_depth=2,
-
+            sitemap="skip",
             ignore_query_parameters=True,
             crawl_entire_domain=True,
             allow_external_links=False,
@@ -26,7 +26,7 @@ class TestAsyncCrawlRequestPreparation:
         assert payload["includePaths"] == ["/docs/*"]
         assert payload["excludePaths"] == ["/admin/*"]
         assert payload["maxDiscoveryDepth"] == 2
-        assert payload["
+        assert payload["sitemap"] == "skip"
         assert payload["ignoreQueryParameters"] is True
         assert payload["crawlEntireDomain"] is True
         assert payload["allowExternalLinks"] is False

firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py
CHANGED

@@ -24,7 +24,7 @@ class TestCrawlRequestPreparation:
             url="https://example.com",
             limit=10,
             max_discovery_depth=3,
-
+            sitemap="skip",
             crawl_entire_domain=False,
             allow_external_links=True
         )
@@ -39,8 +39,8 @@ class TestCrawlRequestPreparation:
         assert data["limit"] == 10
         assert "maxDiscoveryDepth" in data
         assert data["maxDiscoveryDepth"] == 3
-        assert "
-        assert data["
+        assert "sitemap" in data
+        assert data["sitemap"] == "skip"
         assert "crawlEntireDomain" in data
         assert data["crawlEntireDomain"] is False
         assert "allowExternalLinks" in data
@@ -106,7 +106,7 @@ class TestCrawlRequestPreparation:
             include_paths=["/blog/*", "/docs/*"],
             exclude_paths=["/admin/*"],
             max_discovery_depth=3,
-
+            sitemap="include",
             limit=100,
             crawl_entire_domain=True,
             allow_external_links=False,
@@ -126,8 +126,8 @@ class TestCrawlRequestPreparation:
         assert data["excludePaths"] == ["/admin/*"]
         assert "maxDiscoveryDepth" in data
         assert data["maxDiscoveryDepth"] == 3
-        assert "
-        assert data["
+        assert "sitemap" in data
+        assert data["sitemap"] == "include"
         assert "limit" in data
         assert data["limit"] == 100
         assert "crawlEntireDomain" in data
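The updated tests now send a string-valued sitemap crawl option ("skip" or "include"). A rough sketch of passing it through the v2 client; the Firecrawl entry point and the crawl signature are assumptions about the public API, while the option values come from the tests above.

from firecrawl import Firecrawl  # assumed sync v2 client entry point

client = Firecrawl(api_key="fc-YOUR-KEY")  # hypothetical key

# "skip" and "include" are the values the updated tests assert; semantics presumed from the names.
job = client.crawl(
    "https://example.com",
    limit=10,
    max_discovery_depth=3,
    sitemap="skip",
    crawl_entire_domain=False,
    allow_external_links=True,
)
print(job.status, len(job.data))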
firecrawl/v2/client.py
CHANGED
@@ -13,6 +13,7 @@ from .types import (
     SearchRequest,
     SearchData,
     SourceOption,
+    CategoryOption,
     CrawlRequest,
     CrawlResponse,
     CrawlJob,
@@ -171,6 +172,7 @@ class FirecrawlClient:
         query: str,
         *,
         sources: Optional[List[SourceOption]] = None,
+        categories: Optional[List[CategoryOption]] = None,
         limit: Optional[int] = None,
         tbs: Optional[str] = None,
         location: Optional[str] = None,
@@ -195,6 +197,7 @@ class FirecrawlClient:
         request = SearchRequest(
             query=query,
             sources=sources,
+            categories=categories,
             limit=limit,
             tbs=tbs,
             location=location,
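A short usage sketch of the new categories argument on search(); the Firecrawl entry point is assumed, SearchData is assumed to expose a web result list as in the v2 types, and the accepted category values ("github", "research") come from the validation added in firecrawl/v2/methods/search.py below.

from firecrawl import Firecrawl  # assumed sync v2 client entry point

client = Firecrawl(api_key="fc-YOUR-KEY")  # hypothetical key

# categories accepts plain strings (or Category objects); "github" and "research"
# are the only values the new validation allows.
results = client.search(
    "web scraping sdk",
    sources=["web"],
    categories=["github"],
    limit=5,
)
for item in results.web or []:
    print(item.category, item.url)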
firecrawl/v2/methods/search.py
CHANGED
@@ -121,6 +121,17 @@ def _validate_search_request(request: SearchRequest) -> SearchRequest:
             if source.type not in valid_sources:
                 raise ValueError(f"Invalid source type: {source.type}. Valid types: {valid_sources}")
 
+    # Validate categories (if provided)
+    if request.categories is not None:
+        valid_categories = {"github", "research"}
+        for category in request.categories:
+            if isinstance(category, str):
+                if category not in valid_categories:
+                    raise ValueError(f"Invalid category type: {category}. Valid types: {valid_categories}")
+            elif hasattr(category, 'type'):
+                if category.type not in valid_categories:
+                    raise ValueError(f"Invalid category type: {category.type}. Valid types: {valid_categories}")
+
     # Validate location (if provided)
     if request.location is not None:
         if not isinstance(request.location, str) or len(request.location.strip()) == 0:
firecrawl/v2/types.py
CHANGED
@@ -174,6 +174,12 @@ class Source(BaseModel):
 
 SourceOption = Union[str, Source]
 
+class Category(BaseModel):
+    """Configuration for a search category."""
+    type: str
+
+CategoryOption = Union[str, Category]
+
 FormatString = Literal[
     # camelCase versions (API format)
     "markdown", "html", "rawHtml", "links", "screenshot", "summary", "changeTracking", "json",
@@ -331,7 +337,8 @@ class SearchResultWeb(BaseModel):
     """A web search result with URL, title, and description."""
     url: str
     title: Optional[str] = None
-    description: Optional[str] = None
+    description: Optional[str] = None
+    category: Optional[str] = None
 
 class SearchResultNews(BaseModel):
     """A news search result with URL, title, snippet, date, image URL, and position."""
@@ -341,6 +348,7 @@ class SearchResultNews(BaseModel):
     date: Optional[str] = None
     image_url: Optional[str] = None
     position: Optional[int] = None
+    category: Optional[str] = None
 
 class SearchResultImages(BaseModel):
     """An image search result with URL, title, image URL, image width, image height, and position."""
@@ -521,6 +529,7 @@ class SearchRequest(BaseModel):
     """Request for search operations."""
     query: str
     sources: Optional[List[SourceOption]] = None
+    categories: Optional[List[CategoryOption]] = None
     limit: Optional[int] = 5
     tbs: Optional[str] = None
     location: Optional[str] = None
@@ -547,6 +556,26 @@ class SearchRequest(BaseModel):
                 raise ValueError(f"Invalid source format: {source}")
 
         return normalized_sources
+
+    @field_validator('categories')
+    @classmethod
+    def validate_categories(cls, v):
+        """Validate and normalize categories input."""
+        if v is None:
+            return v
+
+        normalized_categories = []
+        for category in v:
+            if isinstance(category, str):
+                normalized_categories.append(Category(type=category))
+            elif isinstance(category, dict):
+                normalized_categories.append(Category(**category))
+            elif isinstance(category, Category):
+                normalized_categories.append(category)
+            else:
+                raise ValueError(f"Invalid category format: {category}")
+
+        return normalized_categories
 
 class LinkResult(BaseModel):
     """A generic link result with optional metadata (used by search and map)."""
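A small sketch of what the new validate_categories field validator accepts, based on the branches above: strings, dicts, and Category instances all normalize to Category objects (assuming the remaining SearchRequest fields keep their defaults).

from firecrawl.v2.types import Category, SearchRequest  # module path taken from the diff

req = SearchRequest(query="llm agents", categories=["github", {"type": "research"}])
print([c.type for c in req.categories])  # ['github', 'research']

req2 = SearchRequest(query="llm agents", categories=[Category(type="github")])
print(req2.categories[0].type)  # 'github'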
{firecrawl_py-3.2.1.dist-info/licenses → firecrawl_py-3.3.1.dist-info}/LICENSE
File without changes

{firecrawl_py-3.2.1.dist-info → firecrawl_py-3.3.1.dist-info}/METADATA
CHANGED

@@ -1,6 +1,6 @@
-Metadata-Version: 2.
+Metadata-Version: 2.1
 Name: firecrawl-py
-Version: 3.2.1
+Version: 3.3.1
 Summary: Python SDK for Firecrawl API
 Home-page: https://github.com/firecrawl/firecrawl
 Author: Mendable.ai
@@ -38,12 +38,8 @@ Requires-Dist: httpx
 Requires-Dist: python-dotenv
 Requires-Dist: websockets
 Requires-Dist: nest-asyncio
-Requires-Dist: pydantic>=2.0
+Requires-Dist: pydantic (>=2.0)
 Requires-Dist: aiohttp
-Dynamic: author
-Dynamic: home-page
-Dynamic: license-file
-Dynamic: requires-python
 
 # Firecrawl Python SDK
 