firecrawl 2.16.5__py3-none-any.whl → 3.0.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of firecrawl might be problematic.

Files changed (82)
  1. firecrawl/__init__.py +27 -19
  2. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +79 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +38 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +40 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +137 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +183 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +35 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  10. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  11. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +105 -0
  13. firecrawl/__tests__/e2e/v2/test_crawl.py +276 -0
  14. firecrawl/__tests__/e2e/v2/test_extract.py +54 -0
  15. firecrawl/__tests__/e2e/v2/test_map.py +60 -0
  16. firecrawl/__tests__/e2e/v2/test_scrape.py +154 -0
  17. firecrawl/__tests__/e2e/v2/test_search.py +265 -0
  18. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  19. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  20. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  21. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +61 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +19 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +63 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  28. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  29. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  30. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  31. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  32. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +53 -0
  33. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +92 -0
  34. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +167 -0
  35. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +206 -0
  36. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  37. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  38. firecrawl/__tests__/unit/v2/utils/test_validation.py +290 -0
  39. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  40. firecrawl/client.py +241 -0
  41. firecrawl/{firecrawl.py → firecrawl.backup.py} +17 -15
  42. firecrawl/types.py +157 -0
  43. firecrawl/v1/__init__.py +14 -0
  44. firecrawl/v1/client.py +4653 -0
  45. firecrawl/v2/__init__.py +4 -0
  46. firecrawl/v2/client.py +802 -0
  47. firecrawl/v2/client_async.py +250 -0
  48. firecrawl/v2/methods/aio/__init__.py +1 -0
  49. firecrawl/v2/methods/aio/batch.py +85 -0
  50. firecrawl/v2/methods/aio/crawl.py +174 -0
  51. firecrawl/v2/methods/aio/extract.py +126 -0
  52. firecrawl/v2/methods/aio/map.py +59 -0
  53. firecrawl/v2/methods/aio/scrape.py +36 -0
  54. firecrawl/v2/methods/aio/search.py +58 -0
  55. firecrawl/v2/methods/aio/usage.py +42 -0
  56. firecrawl/v2/methods/batch.py +420 -0
  57. firecrawl/v2/methods/crawl.py +468 -0
  58. firecrawl/v2/methods/extract.py +131 -0
  59. firecrawl/v2/methods/map.py +77 -0
  60. firecrawl/v2/methods/scrape.py +68 -0
  61. firecrawl/v2/methods/search.py +173 -0
  62. firecrawl/v2/methods/usage.py +41 -0
  63. firecrawl/v2/types.py +546 -0
  64. firecrawl/v2/utils/__init__.py +9 -0
  65. firecrawl/v2/utils/error_handler.py +107 -0
  66. firecrawl/v2/utils/get_version.py +15 -0
  67. firecrawl/v2/utils/http_client.py +153 -0
  68. firecrawl/v2/utils/http_client_async.py +64 -0
  69. firecrawl/v2/utils/validation.py +324 -0
  70. firecrawl/v2/watcher.py +312 -0
  71. firecrawl/v2/watcher_async.py +245 -0
  72. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/LICENSE +0 -0
  73. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/METADATA +49 -32
  74. firecrawl-3.0.3.dist-info/RECORD +78 -0
  75. tests/test_timeout_conversion.py +117 -0
  76. firecrawl/__tests__/e2e_withAuth/__init__.py +0 -0
  77. firecrawl/__tests__/e2e_withAuth/test.py +0 -170
  78. firecrawl/__tests__/v1/e2e_withAuth/__init__.py +0 -0
  79. firecrawl/__tests__/v1/e2e_withAuth/test.py +0 -465
  80. firecrawl-2.16.5.dist-info/RECORD +0 -12
  81. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/WHEEL +0 -0
  82. {firecrawl-2.16.5.dist-info → firecrawl-3.0.3.dist-info}/top_level.txt +0 -0
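
Only two of the 82 changed files are expanded below: the new firecrawl/v2/watcher.py and firecrawl/v2/watcher_async.py. Both watchers are duck-typed against the client object: they read api_url/api_key from client.http_client and fall back to HTTP polling through get_crawl_status / get_batch_scrape_status. As a quick orientation before the hunks, here is a minimal sketch of driving the sync watcher against a stub client; the stub and import paths are illustrative assumptions based on the file list, not code from the package.

# Illustrative stub only: shows the duck-typed surface the Watcher below expects.
# Import paths follow the file list (firecrawl/v2/watcher.py, firecrawl/v2/types.py);
# the real package presumably wires this up via firecrawl/v2/client.py.
from types import SimpleNamespace

from firecrawl.v2.types import CrawlJob
from firecrawl.v2.watcher import Watcher


class StubClient:
    # Watcher reads api_url/api_key from client.http_client in __init__.
    http_client = SimpleNamespace(api_url="https://api.firecrawl.dev", api_key="fc-...")

    def get_crawl_status(self, job_id: str) -> CrawlJob:
        # A real client performs an HTTP GET here; stubbed for illustration.
        return CrawlJob(status="scraping", completed=0, total=10,
                        credits_used=0, expires_at=None, next=None, data=[])


watcher = Watcher(StubClient(), "job-id", kind="crawl", poll_interval=2, timeout=60)
watcher.add_listener(lambda job: print(job.status, job.completed, job.total))
watcher.start()
# ...
watcher.stop()
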
firecrawl/v2/watcher.py (new file)
@@ -0,0 +1,312 @@
+"""
+WebSocket-based watcher for v2 jobs (crawl and batch), mirroring v1 behavior.
+
+Usage:
+    watcher = client.watcher(job_id, kind="crawl")
+    watcher.add_listener(lambda status: print(status.status))
+    watcher.start()
+"""
+
+import asyncio
+import json
+import threading
+from typing import Callable, List, Optional, Literal, Union, Dict, Any
+
+import websockets
+
+from .types import CrawlJob, BatchScrapeJob, Document
+
+
+JobKind = Literal["crawl", "batch"]
+JobType = Union[CrawlJob, BatchScrapeJob]
+
+
+class Watcher:
+    def __init__(
+        self,
+        client: object,
+        job_id: str,
+        kind: JobKind = "crawl",
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+    ) -> None:
+        self._client = client
+        self._job_id = job_id
+        self._kind = kind
+        self._timeout = timeout
+        self._poll_interval = poll_interval
+        self._listeners: List[Callable[[JobType], None]] = []
+        self._thread: Optional[threading.Thread] = None
+        self._stop = threading.Event()
+
+        http_client = getattr(client, "http_client", None)
+        self._api_url: Optional[str] = getattr(http_client, "api_url", None)
+        self._api_key: Optional[str] = getattr(http_client, "api_key", None)
+
+        # v1-parity state and event handlers
+        self.status: str = "scraping"
+        self.data: List[Dict[str, Any]] = []
+        self._event_handlers: Dict[str, List[Callable[[Dict[str, Any]], None]]] = {
+            "done": [],
+            "error": [],
+            "document": [],
+        }
+        self._sent_done: bool = False
+        self._sent_error: bool = False
+
+    def add_listener(self, callback: Callable[[JobType], None]) -> None:
+        self._listeners.append(callback)
+
+    def _emit(self, status: JobType) -> None:
+        for cb in list(self._listeners):
+            try:
+                cb(status)
+            except Exception:
+                pass
+
+    # v1-like events API
+    def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
+        if event_type in self._event_handlers:
+            self._event_handlers[event_type].append(handler)
+
+    def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
+        if event_type in self._event_handlers:
+            for handler in self._event_handlers[event_type]:
+                try:
+                    handler(detail)
+                except Exception:
+                    pass
+
+    def _build_ws_url(self) -> str:
+        if not self._api_url:
+            raise ValueError("API URL is required for WebSocket watcher")
+        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
+        if self._kind == "crawl":
+            return f"{ws_base}/v2/crawl/{self._job_id}"
+        return f"{ws_base}/v2/batch/scrape/{self._job_id}"
+
+    async def _run_ws(self) -> None:
+        uri = self._build_ws_url()
+        headers_list = []
+        if self._api_key:
+            headers_list.append(("Authorization", f"Bearer {self._api_key}"))
+
+        try:
+            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
+                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
+                while not self._stop.is_set():
+                    # Use short recv timeouts to allow HTTP polling fallback
+                    if deadline is not None:
+                        remaining = max(0.0, deadline - asyncio.get_event_loop().time())
+                        timeout = min(self._poll_interval or remaining, remaining)
+                    else:
+                        timeout = self._poll_interval or 5
+                    try:
+                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                    except asyncio.TimeoutError:
+                        # Quiet period: poll HTTP once to progress statuses
+                        if await self._poll_status_once():
+                            break
+                        else:
+                            continue
+                    except asyncio.CancelledError:
+                        break
+                    except Exception:
+                        # Connection error: switch to HTTP polling until terminal or timeout
+                        while not self._stop.is_set():
+                            if await self._poll_status_once():
+                                return
+                            if deadline is not None and asyncio.get_event_loop().time() >= deadline:
+                                return
+                            await asyncio.sleep(self._poll_interval or 2)
+                        return
+
+                    try:
+                        body = json.loads(msg)
+                    except Exception:
+                        continue
+
+                    # v1-style typed event handling
+                    msg_type = body.get("type")
+                    if msg_type == "error":
+                        self.status = "failed"
+                        self.dispatch_event("error", {
+                            "status": self.status,
+                            "data": self.data,
+                            "error": body.get("error"),
+                            "id": self._job_id,
+                        })
+                        self._sent_error = True
+                        # Emit a final failed snapshot for listeners
+                        if self._kind == "crawl":
+                            job = CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        else:
+                            job = BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        self._emit(job)
+                        break
+                    elif msg_type == "catchup":
+                        d = body.get("data", {})
+                        self.status = d.get("status", self.status)
+                        docs_in = d.get("data", [])
+                        self.data.extend(docs_in)
+                        for doc in docs_in:
+                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
+                    elif msg_type == "document":
+                        doc = body.get("data")
+                        if isinstance(doc, dict):
+                            self.data.append(doc)
+                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
+                    elif msg_type == "done":
+                        self.status = "completed"
+                        # Gather any documents in the done payload
+                        raw_payload = body.get("data", {}) or {}
+                        docs_in = raw_payload.get("data", []) or []
+                        if isinstance(docs_in, list) and docs_in:
+                            for doc in docs_in:
+                                if isinstance(doc, dict):
+                                    self.data.append(doc)
+                        # Dispatch done event first
+                        self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
+                        self._sent_done = True
+                        # Emit a final completed snapshot for listeners and break immediately
+                        docs: List[Document] = []
+                        for doc in self.data:
+                            if isinstance(doc, dict):
+                                d = dict(doc)
+                                if "rawHtml" in d and "raw_html" not in d:
+                                    d["raw_html"] = d.pop("rawHtml")
+                                if "changeTracking" in d and "change_tracking" not in d:
+                                    d["change_tracking"] = d.pop("changeTracking")
+                                docs.append(Document(**d))
+                        if self._kind == "crawl":
+                            job = CrawlJob(
+                                status="completed",
+                                completed=raw_payload.get("completed", 0),
+                                total=raw_payload.get("total", 0),
+                                credits_used=raw_payload.get("creditsUsed", 0),
+                                expires_at=raw_payload.get("expiresAt"),
+                                next=raw_payload.get("next"),
+                                data=docs,
+                            )
+                        else:
+                            job = BatchScrapeJob(
+                                status="completed",
+                                completed=raw_payload.get("completed", 0),
+                                total=raw_payload.get("total", 0),
+                                credits_used=raw_payload.get("creditsUsed", 0),
+                                expires_at=raw_payload.get("expiresAt"),
+                                next=raw_payload.get("next"),
+                                data=docs,
+                            )
+                        self._emit(job)
+                        break
+
+                    payload = body.get("data", body)
+                    # Only treat messages with an explicit status as job snapshots
+                    has_status_field = (isinstance(payload, dict) and "status" in payload) or ("status" in body)
+                    if not has_status_field:
+                        continue
+                    status_str = payload.get("status", body.get("status", self.status))
+
+                    if self._kind == "crawl":
+                        docs = []
+                        for doc in payload.get("data", []):
+                            if isinstance(doc, dict):
+                                d = dict(doc)
+                                if "rawHtml" in d and "raw_html" not in d:
+                                    d["raw_html"] = d.pop("rawHtml")
+                                if "changeTracking" in d and "change_tracking" not in d:
+                                    d["change_tracking"] = d.pop("changeTracking")
+                                docs.append(Document(**d))
+                        job = CrawlJob(
+                            status=status_str,
+                            completed=payload.get("completed", 0),
+                            total=payload.get("total", 0),
+                            credits_used=payload.get("creditsUsed", 0),
+                            expires_at=payload.get("expiresAt"),
+                            next=payload.get("next"),
+                            data=docs,
+                        )
+                        self._emit(job)
+                        if status_str in ("completed", "failed", "cancelled"):
+                            # Ensure done/error dispatched even if server didn't send explicit event type
+                            if status_str == "completed" and not self._sent_done:
+                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_done = True
+                            if status_str == "failed" and not self._sent_error:
+                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_error = True
+                            break
+                    else:
+                        docs = []
+                        for doc in payload.get("data", []):
+                            if isinstance(doc, dict):
+                                d = dict(doc)
+                                if "rawHtml" in d and "raw_html" not in d:
+                                    d["raw_html"] = d.pop("rawHtml")
+                                if "changeTracking" in d and "change_tracking" not in d:
+                                    d["change_tracking"] = d.pop("changeTracking")
+                                docs.append(Document(**d))
+                        job = BatchScrapeJob(
+                            status=status_str,
+                            completed=payload.get("completed", 0),
+                            total=payload.get("total", 0),
+                            credits_used=payload.get("creditsUsed"),
+                            expires_at=payload.get("expiresAt"),
+                            next=payload.get("next"),
+                            data=docs,
+                        )
+                        self._emit(job)
+                        if status_str in ("completed", "failed", "cancelled"):
+                            if status_str == "completed" and not self._sent_done:
+                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_done = True
+                            if status_str == "failed" and not self._sent_error:
+                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_error = True
+                            break
+        except Exception:
+            pass
+        finally:
+            # Ensure terminal event parity with v1 even on abrupt disconnects
+            if self.status == "completed" and not self._sent_done:
+                self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
+                self._sent_done = True
+
+    async def _poll_status_once(self) -> bool:
+        """Poll job status over HTTP once. Returns True if terminal."""
+        try:
+            if self._kind == "crawl":
+                job: CrawlJob = await asyncio.to_thread(self._client.get_crawl_status, self._job_id)
+            else:
+                job: BatchScrapeJob = await asyncio.to_thread(self._client.get_batch_scrape_status, self._job_id)
+        except Exception:
+            return False
+
+        self.status = job.status
+        self._emit(job)
+        if job.status in ("completed", "failed", "cancelled"):
+            if job.status == "completed" and not self._sent_done:
+                self.dispatch_event("done", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
+                self._sent_done = True
+            if job.status == "failed" and not self._sent_error:
+                self.dispatch_event("error", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
+                self._sent_error = True
+            return True
+        return False
+
+    def _loop(self) -> None:
+        asyncio.run(self._run_ws())
+
+    def start(self) -> None:
+        if self._thread and self._thread.is_alive():
+            return
+        self._stop.clear()
+        self._thread = threading.Thread(target=self._loop, daemon=True)
+        self._thread.start()
+
+    def stop(self) -> None:
+        self._stop.set()
+        if self._thread:
+            self._thread.join(timeout=1)
+
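
The module docstring above names the intended entry point (client.watcher(job_id, kind="crawl")). Below is a slightly fuller sketch of the v1-style event API it carries over; the Firecrawl constructor and the start_crawl call are assumptions about the surrounding v2 client, not taken from this hunk.

# Usage sketch for the Watcher above, via the client factory named in its docstring.
# Assumed (not shown in this diff): the top-level Firecrawl client and start_crawl().
from firecrawl import Firecrawl

client = Firecrawl(api_key="fc-...")
started = client.start_crawl("https://docs.firecrawl.dev")  # assumed to return an object with .id

watcher = client.watcher(started.id, kind="crawl")

# Typed snapshots (CrawlJob) for every status message or poll.
watcher.add_listener(lambda job: print(job.status, f"{job.completed}/{job.total}"))

# v1-parity events with raw dict payloads: "document", "done", "error".
watcher.add_event_listener("document", lambda detail: print("document for job", detail["id"]))
watcher.add_event_listener("done", lambda detail: print("done with", len(detail["data"]), "documents"))
watcher.add_event_listener("error", lambda detail: print("failed:", detail.get("error")))

watcher.start()   # WebSocket loop runs on a daemon thread; HTTP polling is the fallback
# ... do other work, then:
watcher.stop()
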
firecrawl/v2/watcher_async.py (new file)
@@ -0,0 +1,245 @@
+"""
+Async WebSocket watcher with async iterator interface for v2 jobs.
+
+Usage:
+    async for snapshot in AsyncWatcher(client, job_id, kind="crawl"):
+        print(snapshot.status)
+"""
+
+import asyncio
+import inspect
+import json
+import time
+from typing import AsyncIterator, Dict, List, Literal, Optional
+
+import websockets
+from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError
+
+from .types import BatchScrapeJob, CrawlJob, Document
+
+JobKind = Literal["crawl", "batch"]
+
+
+class AsyncWatcher:
+    def __init__(
+        self,
+        client: object,
+        job_id: str,
+        *,
+        kind: JobKind = "crawl",
+        timeout: Optional[int] = None,
+    ) -> None:
+        self._client = client
+        self._job_id = job_id
+        self._kind = kind
+        self._timeout = timeout
+        self._poll_interval: float = 2.0
+
+        http_client = getattr(client, "http_client", None)
+        if http_client is not None:
+            self._api_url = getattr(http_client, "api_url", None)
+            self._api_key = getattr(http_client, "api_key", None)
+        else:
+            # Allow passing the top-level Firecrawl client directly
+            self._api_url = getattr(client, "api_url", None)
+            self._api_key = getattr(client, "api_key", None)
+
+        self._status: str = "scraping"
+        self._data: List[Dict] = []
+
+    def __aiter__(self) -> AsyncIterator[object]:
+        return self._iterate()
+
+    def _build_ws_url(self) -> str:
+        if not self._api_url:
+            raise ValueError("API URL is required for WebSocket watcher")
+        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
+        if self._kind == "crawl":
+            return f"{ws_base}/v2/crawl/{self._job_id}"
+        return f"{ws_base}/v2/batch/scrape/{self._job_id}"
+
+    async def _iterate(self) -> AsyncIterator[object]:
+        uri = self._build_ws_url()
+        headers_list = []
+        if self._api_key:
+            headers_list.append(("Authorization", f"Bearer {self._api_key}"))
+
+        # Attempt to establish WS; on failure, fall back to HTTP polling immediately
+        try:
+            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
+                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
+                # Pre-yield a snapshot if available to ensure progress is visible
+                try:
+                    pre = await self._fetch_job_status()
+                    yield pre
+                    if pre.status in ("completed", "failed", "cancelled"):
+                        return
+                except Exception:
+                    pass
+
+                while True:
+                    try:
+                        if deadline is not None:
+                            remaining = max(0.0, deadline - asyncio.get_event_loop().time())
+                            timeout = min(self._poll_interval, remaining) if remaining > 0 else 0.0
+                        else:
+                            timeout = self._poll_interval
+                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                    except asyncio.TimeoutError:
+                        # Quiet period: poll HTTP once
+                        job = await self._safe_fetch()
+                        if job is not None:
+                            yield job
+                            if job.status in ("completed", "failed", "cancelled"):
+                                return
+                        if deadline is not None and asyncio.get_event_loop().time() >= deadline:
+                            return
+                        continue
+                    except (ConnectionClosedOK, ConnectionClosed, ConnectionClosedError):
+                        # Graceful/abrupt close: poll HTTP until terminal (bounded by timeout)
+                        deadline = time.time() + (self._timeout or 30)
+                        while True:
+                            try:
+                                job = await self._fetch_job_status()
+                                yield job
+                                if job.status in ("completed", "failed", "cancelled"):
+                                    return
+                            except Exception:
+                                return
+                            if time.time() >= deadline:
+                                return
+                            await asyncio.sleep(1)
+                    try:
+                        body = json.loads(msg)
+                    except Exception:
+                        continue
+
+                    msg_type = body.get("type")
+                    if msg_type == "error":
+                        self._status = "failed"
+                        # Yield a terminal snapshot
+                        if self._kind == "crawl":
+                            yield CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        else:
+                            yield BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        return
+                    elif msg_type == "catchup":
+                        d = body.get("data", {})
+                        self._status = d.get("status", self._status)
+                        docs_in = d.get("data", []) or []
+                        self._data.extend(docs_in)
+                        # Fall through to emit a snapshot below
+                    elif msg_type == "document":
+                        doc = body.get("data")
+                        if isinstance(doc, dict):
+                            self._data.append(doc)
+                        # Fall through to emit a snapshot below
+                    elif msg_type == "done":
+                        self._status = "completed"
+                        raw_payload = body.get("data", {}) or {}
+                        docs_in = raw_payload.get("data", []) or []
+                        if isinstance(docs_in, list) and docs_in:
+                            for doc in docs_in:
+                                if isinstance(doc, dict):
+                                    self._data.append(doc)
+                        # Emit final snapshot then end
+                        yield self._make_snapshot(status="completed", payload=raw_payload, docs_override=self._data)
+                        return
+
+                    # Generic snapshot emit for status messages and periodic progress
+                    payload = body.get("data", body)
+                    status_str = payload.get("status", body.get("status", self._status))
+                    snapshot = self._make_snapshot(status=status_str, payload=payload)
+                    yield snapshot
+                    if status_str in ("completed", "failed", "cancelled"):
+                        return
+        except Exception:
+            # WS connect failure: fallback to HTTP polling loop until terminal/timeout
+            deadline = time.time() + (self._timeout or 30)
+            while True:
+                try:
+                    job = await self._fetch_job_status()
+                    yield job
+                    if job.status in ("completed", "failed", "cancelled"):
+                        return
+                except Exception:
+                    return
+                if time.time() >= deadline:
+                    return
+                await asyncio.sleep(1)
+
+    async def _fetch_job_status(self):
+        if self._kind == "crawl":
+            return await self._call_status_method("get_crawl_status")
+        return await self._call_status_method("get_batch_scrape_status")
+
+    async def _call_status_method(self, method_name: str):
+        # Try on client directly
+        meth = getattr(self._client, method_name, None)
+        if meth is not None:
+            try:
+                result = meth(self._job_id)
+            except TypeError:
+                result = None
+            if result is not None:
+                if inspect.isawaitable(result):
+                    return await result
+                return result
+            # Fallback: if we couldn't call directly, try to_thread
+            return await asyncio.to_thread(meth, self._job_id)
+
+        # Try on client.v2
+        v2 = getattr(self._client, "v2", None)
+        if v2 is not None:
+            meth = getattr(v2, method_name, None)
+            if meth is not None:
+                try:
+                    result = meth(self._job_id)
+                except TypeError:
+                    result = None
+                if result is not None:
+                    if inspect.isawaitable(result):
+                        return await result
+                    return result
+                return await asyncio.to_thread(meth, self._job_id)
+
+        raise RuntimeError(f"Client does not expose {method_name}")
+
+    async def _safe_fetch(self):
+        try:
+            return await self._fetch_job_status()
+        except Exception:
+            return None
+
+    def _make_snapshot(self, *, status: str, payload: Dict, docs_override: Optional[List[Dict]] = None):
+        docs = []
+        source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
+        for doc in source_docs:
+            if isinstance(doc, dict):
+                d = dict(doc)
+                if "rawHtml" in d and "raw_html" not in d:
+                    d["raw_html"] = d.pop("rawHtml")
+                if "changeTracking" in d and "change_tracking" not in d:
+                    d["change_tracking"] = d.pop("changeTracking")
+                docs.append(Document(**d))
+
+        if self._kind == "crawl":
+            return CrawlJob(
+                status=status,
+                completed=payload.get("completed", 0),
+                total=payload.get("total", 0),
+                credits_used=payload.get("creditsUsed", 0),
+                expires_at=payload.get("expiresAt"),
+                next=payload.get("next"),
+                data=docs,
+            )
+        return BatchScrapeJob(
+            status=status,
+            completed=payload.get("completed", 0),
+            total=payload.get("total", 0),
+            credits_used=payload.get("creditsUsed"),
+            expires_at=payload.get("expiresAt"),
+            next=payload.get("next"),
+            data=docs,
+        )
+
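
For parity, here is a sketch of consuming the async watcher above. The constructor, the keyword-only kind/timeout arguments, and the async-for protocol come from this hunk; the client object is assumed to be any v2 client exposing get_crawl_status (sync or async), which _call_status_method tolerates.

# Usage sketch for AsyncWatcher above. Any client exposing get_crawl_status /
# get_batch_scrape_status (sync or async) plus api_url/api_key will do, per
# __init__ and _call_status_method in the hunk.
import asyncio

from firecrawl.v2.watcher_async import AsyncWatcher


async def watch_crawl(client, job_id: str) -> None:
    # Each iteration yields a CrawlJob snapshot; terminal statuses end the stream.
    async for snapshot in AsyncWatcher(client, job_id, kind="crawl", timeout=120):
        print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")
        if snapshot.status in ("completed", "failed", "cancelled"):
            break


# asyncio.run(watch_crawl(client, job_id))  # with a client and job id obtained elsewhere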