firecrawl 4.12.0 (firecrawl-4.12.0-py3-none-any.whl)
This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- firecrawl/__init__.py +87 -0
- firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
- firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
- firecrawl/__tests__/e2e/v2/conftest.py +73 -0
- firecrawl/__tests__/e2e/v2/test_async.py +73 -0
- firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
- firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
- firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
- firecrawl/__tests__/e2e/v2/test_map.py +61 -0
- firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
- firecrawl/__tests__/e2e/v2/test_search.py +270 -0
- firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
- firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
- firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
- firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
- firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
- firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
- firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
- firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
- firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
- firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
- firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
- firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
- firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
- firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
- firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
- firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
- firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
- firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
- firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
- firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
- firecrawl/client.py +281 -0
- firecrawl/firecrawl.backup.py +4635 -0
- firecrawl/types.py +167 -0
- firecrawl/v1/__init__.py +14 -0
- firecrawl/v1/client.py +5164 -0
- firecrawl/v2/__init__.py +4 -0
- firecrawl/v2/client.py +967 -0
- firecrawl/v2/client_async.py +408 -0
- firecrawl/v2/methods/agent.py +144 -0
- firecrawl/v2/methods/aio/__init__.py +1 -0
- firecrawl/v2/methods/aio/agent.py +137 -0
- firecrawl/v2/methods/aio/batch.py +188 -0
- firecrawl/v2/methods/aio/crawl.py +351 -0
- firecrawl/v2/methods/aio/extract.py +133 -0
- firecrawl/v2/methods/aio/map.py +65 -0
- firecrawl/v2/methods/aio/scrape.py +33 -0
- firecrawl/v2/methods/aio/search.py +176 -0
- firecrawl/v2/methods/aio/usage.py +89 -0
- firecrawl/v2/methods/batch.py +499 -0
- firecrawl/v2/methods/crawl.py +592 -0
- firecrawl/v2/methods/extract.py +161 -0
- firecrawl/v2/methods/map.py +83 -0
- firecrawl/v2/methods/scrape.py +64 -0
- firecrawl/v2/methods/search.py +215 -0
- firecrawl/v2/methods/usage.py +84 -0
- firecrawl/v2/types.py +1143 -0
- firecrawl/v2/utils/__init__.py +9 -0
- firecrawl/v2/utils/error_handler.py +107 -0
- firecrawl/v2/utils/get_version.py +15 -0
- firecrawl/v2/utils/http_client.py +178 -0
- firecrawl/v2/utils/http_client_async.py +69 -0
- firecrawl/v2/utils/normalize.py +125 -0
- firecrawl/v2/utils/validation.py +692 -0
- firecrawl/v2/watcher.py +301 -0
- firecrawl/v2/watcher_async.py +243 -0
- firecrawl-4.12.0.dist-info/METADATA +234 -0
- firecrawl-4.12.0.dist-info/RECORD +92 -0
- firecrawl-4.12.0.dist-info/WHEEL +5 -0
- firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
- firecrawl-4.12.0.dist-info/top_level.txt +2 -0
- tests/test_agent_integration.py +277 -0
- tests/test_api_key_handling.py +44 -0
- tests/test_change_tracking.py +98 -0
- tests/test_timeout_conversion.py +117 -0
firecrawl/v2/watcher.py
ADDED
@@ -0,0 +1,301 @@
"""
WebSocket-based watcher for v2 jobs (crawl and batch), mirroring v1 behavior.

Usage:
    watcher = client.watcher(job_id, kind="crawl")
    watcher.add_listener(lambda status: print(status.status))
    watcher.start()
"""

import asyncio
import json
import threading
from typing import Callable, List, Optional, Literal, Union, Dict, Any

import websockets

from .types import CrawlJob, BatchScrapeJob, Document
from .utils.normalize import normalize_document_input


JobKind = Literal["crawl", "batch"]
JobType = Union[CrawlJob, BatchScrapeJob]


class Watcher:
    def __init__(
        self,
        client: object,
        job_id: str,
        kind: JobKind = "crawl",
        poll_interval: int = 2,
        timeout: Optional[int] = None,
    ) -> None:
        self._client = client
        self._job_id = job_id
        self._kind = kind
        self._timeout = timeout
        self._poll_interval = poll_interval
        self._listeners: List[Callable[[JobType], None]] = []
        self._thread: Optional[threading.Thread] = None
        self._stop = threading.Event()

        http_client = getattr(client, "http_client", None)
        self._api_url: Optional[str] = getattr(http_client, "api_url", None)
        self._api_key: Optional[str] = getattr(http_client, "api_key", None)

        # v1-parity state and event handlers
        self.status: str = "scraping"
        self.data: List[Dict[str, Any]] = []
        self._event_handlers: Dict[str, List[Callable[[Dict[str, Any]], None]]] = {
            "done": [],
            "error": [],
            "document": [],
        }
        self._sent_done: bool = False
        self._sent_error: bool = False

    def add_listener(self, callback: Callable[[JobType], None]) -> None:
        self._listeners.append(callback)

    def _emit(self, status: JobType) -> None:
        for cb in list(self._listeners):
            try:
                cb(status)
            except Exception:
                pass

    # v1-like events API
    def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
        if event_type in self._event_handlers:
            self._event_handlers[event_type].append(handler)

    def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
        if event_type in self._event_handlers:
            for handler in self._event_handlers[event_type]:
                try:
                    handler(detail)
                except Exception:
                    pass

    def _build_ws_url(self) -> str:
        if not self._api_url:
            raise ValueError("API URL is required for WebSocket watcher")
        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
        if self._kind == "crawl":
            return f"{ws_base}/v2/crawl/{self._job_id}"
        return f"{ws_base}/v2/batch/scrape/{self._job_id}"

    async def _run_ws(self) -> None:
        uri = self._build_ws_url()
        headers_list = []
        if self._api_key:
            headers_list.append(("Authorization", f"Bearer {self._api_key}"))

        try:
            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
                while not self._stop.is_set():
                    # Use short recv timeouts to allow HTTP polling fallback
                    if deadline is not None:
                        remaining = max(0.0, deadline - asyncio.get_event_loop().time())
                        timeout = min(self._poll_interval or remaining, remaining)
                    else:
                        timeout = self._poll_interval or 5
                    try:
                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
                    except asyncio.TimeoutError:
                        # Quiet period: poll HTTP once to progress statuses
                        if await self._poll_status_once():
                            break
                        else:
                            continue
                    except asyncio.CancelledError:
                        break
                    except Exception:
                        # Connection error: switch to HTTP polling until terminal or timeout
                        while not self._stop.is_set():
                            if await self._poll_status_once():
                                return
                            if deadline is not None and asyncio.get_event_loop().time() >= deadline:
                                return
                            await asyncio.sleep(self._poll_interval or 2)
                        return

                    try:
                        body = json.loads(msg)
                    except Exception:
                        continue

                    # v1-style typed event handling
                    msg_type = body.get("type")
                    if msg_type == "error":
                        self.status = "failed"
                        self.dispatch_event("error", {
                            "status": self.status,
                            "data": self.data,
                            "error": body.get("error"),
                            "id": self._job_id,
                        })
                        self._sent_error = True
                        # Emit a final failed snapshot for listeners
                        if self._kind == "crawl":
                            job = CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        else:
                            job = BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        self._emit(job)
                        break
                    elif msg_type == "catchup":
                        d = body.get("data", {})
                        self.status = d.get("status", self.status)
                        docs_in = d.get("data", [])
                        self.data.extend(docs_in)
                        for doc in docs_in:
                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
                    elif msg_type == "document":
                        doc = body.get("data")
                        if isinstance(doc, dict):
                            self.data.append(doc)
                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
                    elif msg_type == "done":
                        self.status = "completed"
                        # Gather any documents in the done payload
                        raw_payload = body.get("data", {}) or {}
                        docs_in = raw_payload.get("data", []) or []
                        if isinstance(docs_in, list) and docs_in:
                            for doc in docs_in:
                                if isinstance(doc, dict):
                                    self.data.append(doc)
                        # Dispatch done event first
                        self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
                        self._sent_done = True
                        # Emit a final completed snapshot for listeners and break immediately
                        docs: List[Document] = []
                        for doc in self.data:
                            if isinstance(doc, dict):
                                d = normalize_document_input(doc)
                                docs.append(Document(**d))
                        if self._kind == "crawl":
                            job = CrawlJob(
                                status="completed",
                                completed=raw_payload.get("completed", 0),
                                total=raw_payload.get("total", 0),
                                credits_used=raw_payload.get("creditsUsed", 0),
                                expires_at=raw_payload.get("expiresAt"),
                                next=raw_payload.get("next"),
                                data=docs,
                            )
                        else:
                            job = BatchScrapeJob(
                                status="completed",
                                completed=raw_payload.get("completed", 0),
                                total=raw_payload.get("total", 0),
                                credits_used=raw_payload.get("creditsUsed", 0),
                                expires_at=raw_payload.get("expiresAt"),
                                next=raw_payload.get("next"),
                                data=docs,
                            )
                        self._emit(job)
                        break

                    payload = body.get("data", body)
                    # Only treat messages with an explicit status as job snapshots
                    has_status_field = (isinstance(payload, dict) and "status" in payload) or ("status" in body)
                    if not has_status_field:
                        continue
                    status_str = payload.get("status", body.get("status", self.status))

                    if self._kind == "crawl":
                        docs = []
                        for doc in payload.get("data", []):
                            if isinstance(doc, dict):
                                d = normalize_document_input(doc)
                                docs.append(Document(**d))
                        job = CrawlJob(
                            status=status_str,
                            completed=payload.get("completed", 0),
                            total=payload.get("total", 0),
                            credits_used=payload.get("creditsUsed", 0),
                            expires_at=payload.get("expiresAt"),
                            next=payload.get("next"),
                            data=docs,
                        )
                        self._emit(job)
                        if status_str in ("completed", "failed", "cancelled"):
                            # Ensure done/error dispatched even if server didn't send explicit event type
                            if status_str == "completed" and not self._sent_done:
                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
                                self._sent_done = True
                            if status_str == "failed" and not self._sent_error:
                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
                                self._sent_error = True
                            break
                    else:
                        docs = []
                        for doc in payload.get("data", []):
                            if isinstance(doc, dict):
                                d = normalize_document_input(doc)
                                docs.append(Document(**d))
                        job = BatchScrapeJob(
                            status=status_str,
                            completed=payload.get("completed", 0),
                            total=payload.get("total", 0),
                            credits_used=payload.get("creditsUsed"),
                            expires_at=payload.get("expiresAt"),
                            next=payload.get("next"),
                            data=docs,
                        )
                        self._emit(job)
                        if status_str in ("completed", "failed", "cancelled"):
                            if status_str == "completed" and not self._sent_done:
                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
                                self._sent_done = True
                            if status_str == "failed" and not self._sent_error:
                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
                                self._sent_error = True
                            break
        except Exception:
            pass
        finally:
            # Ensure terminal event parity with v1 even on abrupt disconnects
            if self.status == "completed" and not self._sent_done:
                self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
                self._sent_done = True

    async def _poll_status_once(self) -> bool:
        """Poll job status over HTTP once. Returns True if terminal."""
        try:
            if self._kind == "crawl":
                job: CrawlJob = await asyncio.to_thread(self._client.get_crawl_status, self._job_id)
            else:
                job: BatchScrapeJob = await asyncio.to_thread(self._client.get_batch_scrape_status, self._job_id)
        except Exception:
            return False

        self.status = job.status
        self._emit(job)
        if job.status in ("completed", "failed", "cancelled"):
            if job.status == "completed" and not self._sent_done:
                self.dispatch_event("done", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
                self._sent_done = True
            if job.status == "failed" and not self._sent_error:
                self.dispatch_event("error", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
                self._sent_error = True
            return True
        return False

    def _loop(self) -> None:
        asyncio.run(self._run_ws())

    def start(self) -> None:
        if self._thread and self._thread.is_alive():
            return
        self._stop.clear()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def stop(self) -> None:
        self._stop.set()
        if self._thread:
            self._thread.join(timeout=1)
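Editor's note: a minimal usage sketch for the synchronous Watcher above, based only on its module docstring and public methods. The `firecrawl_client` and `job_id` names are illustrative placeholders, and the assumption is that the v2 client's `watcher(...)` factory returns this class.

# Sketch only; assumes `firecrawl_client` is a configured v2 client and
# `job_id` came from a previously started crawl.
watcher = firecrawl_client.watcher(job_id, kind="crawl")
# v1-style typed events ("document", "done", "error")
watcher.add_event_listener("document", lambda detail: print("document for job", detail["id"]))
# Snapshot listener receives CrawlJob / BatchScrapeJob objects
watcher.add_listener(lambda snap: print(snap.status, f"{snap.completed}/{snap.total}"))
watcher.start()          # runs the WebSocket loop in a background thread
# ... later, once a "done" or "error" event has fired:
watcher.stop()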
firecrawl/v2/watcher_async.py
ADDED
@@ -0,0 +1,243 @@
"""
Async WebSocket watcher with async iterator interface for v2 jobs.

Usage:
    async for snapshot in AsyncWatcher(client, job_id, kind="crawl"):
        print(snapshot.status)
"""

import asyncio
import inspect
import json
import time
from typing import AsyncIterator, Dict, List, Literal, Optional

import websockets
from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError

from .types import BatchScrapeJob, CrawlJob, Document
from .utils.normalize import normalize_document_input

JobKind = Literal["crawl", "batch"]


class AsyncWatcher:
    def __init__(
        self,
        client: object,
        job_id: str,
        *,
        kind: JobKind = "crawl",
        poll_interval: int = 2,
        timeout: Optional[int] = None,
    ) -> None:
        self._client = client
        self._job_id = job_id
        self._kind = kind
        self._timeout = timeout
        self._poll_interval: float = max(0.0, float(poll_interval))  # Guard against negative values

        http_client = getattr(client, "http_client", None)
        if http_client is not None:
            self._api_url = getattr(http_client, "api_url", None)
            self._api_key = getattr(http_client, "api_key", None)
        else:
            # Allow passing the top-level Firecrawl client directly
            self._api_url = getattr(client, "api_url", None)
            self._api_key = getattr(client, "api_key", None)

        self._status: str = "scraping"
        self._data: List[Dict] = []

    def __aiter__(self) -> AsyncIterator[object]:
        return self._iterate()

    def _build_ws_url(self) -> str:
        if not self._api_url:
            raise ValueError("API URL is required for WebSocket watcher")
        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
        if self._kind == "crawl":
            return f"{ws_base}/v2/crawl/{self._job_id}"
        return f"{ws_base}/v2/batch/scrape/{self._job_id}"

    async def _iterate(self) -> AsyncIterator[object]:
        uri = self._build_ws_url()
        headers_list = []
        if self._api_key:
            headers_list.append(("Authorization", f"Bearer {self._api_key}"))

        # Attempt to establish WS; on failure, fall back to HTTP polling immediately
        try:
            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
                # Pre-yield a snapshot if available to ensure progress is visible
                try:
                    pre = await self._fetch_job_status()
                    yield pre
                    if pre.status in ("completed", "failed", "cancelled"):
                        return
                except Exception:
                    pass

                while True:
                    try:
                        if deadline is not None:
                            remaining = max(0.0, deadline - asyncio.get_event_loop().time())
                            timeout = min(self._poll_interval, remaining) if remaining > 0 else 0.0
                        else:
                            timeout = self._poll_interval
                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
                    except asyncio.TimeoutError:
                        # Quiet period: poll HTTP once
                        job = await self._safe_fetch()
                        if job is not None:
                            yield job
                            if job.status in ("completed", "failed", "cancelled"):
                                return
                        if deadline is not None and asyncio.get_event_loop().time() >= deadline:
                            return
                        continue
                    except (ConnectionClosedOK, ConnectionClosed, ConnectionClosedError):
                        # Graceful/abrupt close: poll HTTP until terminal (bounded by timeout)
                        deadline = time.time() + (self._timeout or 30)
                        while True:
                            try:
                                job = await self._fetch_job_status()
                                yield job
                                if job.status in ("completed", "failed", "cancelled"):
                                    return
                            except Exception:
                                return
                            if time.time() >= deadline:
                                return
                            await asyncio.sleep(1)
                    try:
                        body = json.loads(msg)
                    except Exception:
                        continue

                    msg_type = body.get("type")
                    if msg_type == "error":
                        self._status = "failed"
                        # Yield a terminal snapshot
                        if self._kind == "crawl":
                            yield CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        else:
                            yield BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
                        return
                    elif msg_type == "catchup":
                        d = body.get("data", {})
                        self._status = d.get("status", self._status)
                        docs_in = d.get("data", []) or []
                        self._data.extend(docs_in)
                        # Fall through to emit a snapshot below
                    elif msg_type == "document":
                        doc = body.get("data")
                        if isinstance(doc, dict):
                            self._data.append(doc)
                        # Fall through to emit a snapshot below
                    elif msg_type == "done":
                        self._status = "completed"
                        raw_payload = body.get("data", {}) or {}
                        docs_in = raw_payload.get("data", []) or []
                        if isinstance(docs_in, list) and docs_in:
                            for doc in docs_in:
                                if isinstance(doc, dict):
                                    self._data.append(doc)
                        # Emit final snapshot then end
                        yield self._make_snapshot(status="completed", payload=raw_payload, docs_override=self._data)
                        return

                    # Generic snapshot emit for status messages and periodic progress
                    payload = body.get("data", body)
                    status_str = payload.get("status", body.get("status", self._status))
                    snapshot = self._make_snapshot(status=status_str, payload=payload)
                    yield snapshot
                    if status_str in ("completed", "failed", "cancelled"):
                        return
        except Exception:
            # WS connect failure: fallback to HTTP polling loop until terminal/timeout
            deadline = time.time() + (self._timeout or 30)
            while True:
                try:
                    job = await self._fetch_job_status()
                    yield job
                    if job.status in ("completed", "failed", "cancelled"):
                        return
                except Exception:
                    return
                if time.time() >= deadline:
                    return
                await asyncio.sleep(1)

    async def _fetch_job_status(self):
        if self._kind == "crawl":
            return await self._call_status_method("get_crawl_status")
        return await self._call_status_method("get_batch_scrape_status")

    async def _call_status_method(self, method_name: str):
        # Try on client directly
        meth = getattr(self._client, method_name, None)
        if meth is not None:
            try:
                result = meth(self._job_id)
            except TypeError:
                result = None
            if result is not None:
                if inspect.isawaitable(result):
                    return await result
                return result
            # Fallback: if we couldn't call directly, try to_thread
            return await asyncio.to_thread(meth, self._job_id)

        # Try on client.v2
        v2 = getattr(self._client, "v2", None)
        if v2 is not None:
            meth = getattr(v2, method_name, None)
            if meth is not None:
                try:
                    result = meth(self._job_id)
                except TypeError:
                    result = None
                if result is not None:
                    if inspect.isawaitable(result):
                        return await result
                    return result
                return await asyncio.to_thread(meth, self._job_id)

        raise RuntimeError(f"Client does not expose {method_name}")

    async def _safe_fetch(self):
        try:
            return await self._fetch_job_status()
        except Exception:
            return None

    def _make_snapshot(self, *, status: str, payload: Dict, docs_override: Optional[List[Dict]] = None):
        docs = []
        source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
        for doc in source_docs:
            if isinstance(doc, dict):
                d = normalize_document_input(doc)
                docs.append(Document(**d))

        if self._kind == "crawl":
            return CrawlJob(
                status=status,
                completed=payload.get("completed", 0),
                total=payload.get("total", 0),
                credits_used=payload.get("creditsUsed", 0),
                expires_at=payload.get("expiresAt"),
                next=payload.get("next"),
                data=docs,
            )
        return BatchScrapeJob(
            status=status,
            completed=payload.get("completed", 0),
            total=payload.get("total", 0),
            credits_used=payload.get("creditsUsed"),
            expires_at=payload.get("expiresAt"),
            next=payload.get("next"),
            data=docs,
        )
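Editor's note: a minimal consumption sketch for AsyncWatcher, following its module docstring. It assumes `client` is a v2 client (or anything exposing `get_crawl_status`, per `_call_status_method`) and that `job_id` refers to an existing crawl; both names are placeholders.

# Sketch only; iterate the watcher directly as an async iterator.
import asyncio

async def watch(client, job_id):
    async for snapshot in AsyncWatcher(client, job_id, kind="crawl", timeout=120):
        print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")
        if snapshot.status in ("completed", "failed", "cancelled"):
            break

# asyncio.run(watch(client, job_id))  # with a real client and job id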