firecrawl 4.12.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (92)
  1. firecrawl/__init__.py +87 -0
  2. firecrawl/__tests__/e2e/v2/aio/conftest.py +62 -0
  3. firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +69 -0
  4. firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +189 -0
  5. firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +39 -0
  6. firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +41 -0
  7. firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +138 -0
  8. firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +249 -0
  9. firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +42 -0
  10. firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +43 -0
  11. firecrawl/__tests__/e2e/v2/conftest.py +73 -0
  12. firecrawl/__tests__/e2e/v2/test_async.py +73 -0
  13. firecrawl/__tests__/e2e/v2/test_batch_scrape.py +106 -0
  14. firecrawl/__tests__/e2e/v2/test_crawl.py +278 -0
  15. firecrawl/__tests__/e2e/v2/test_extract.py +55 -0
  16. firecrawl/__tests__/e2e/v2/test_map.py +61 -0
  17. firecrawl/__tests__/e2e/v2/test_scrape.py +191 -0
  18. firecrawl/__tests__/e2e/v2/test_search.py +270 -0
  19. firecrawl/__tests__/e2e/v2/test_usage.py +26 -0
  20. firecrawl/__tests__/e2e/v2/test_watcher.py +65 -0
  21. firecrawl/__tests__/unit/test_recursive_schema_v1.py +1209 -0
  22. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +12 -0
  23. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +79 -0
  24. firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +12 -0
  25. firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +20 -0
  26. firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +50 -0
  27. firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +64 -0
  28. firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +28 -0
  29. firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +117 -0
  30. firecrawl/__tests__/unit/v2/methods/test_agent.py +367 -0
  31. firecrawl/__tests__/unit/v2/methods/test_agent_request_preparation.py +226 -0
  32. firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +90 -0
  33. firecrawl/__tests__/unit/v2/methods/test_branding.py +214 -0
  34. firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +70 -0
  35. firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +240 -0
  36. firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +107 -0
  37. firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +54 -0
  38. firecrawl/__tests__/unit/v2/methods/test_pagination.py +671 -0
  39. firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +109 -0
  40. firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +169 -0
  41. firecrawl/__tests__/unit/v2/methods/test_search_validation.py +236 -0
  42. firecrawl/__tests__/unit/v2/methods/test_usage_types.py +18 -0
  43. firecrawl/__tests__/unit/v2/methods/test_webhook.py +123 -0
  44. firecrawl/__tests__/unit/v2/utils/test_metadata_extras.py +94 -0
  45. firecrawl/__tests__/unit/v2/utils/test_metadata_extras_multivalue.py +22 -0
  46. firecrawl/__tests__/unit/v2/utils/test_recursive_schema.py +1133 -0
  47. firecrawl/__tests__/unit/v2/utils/test_validation.py +311 -0
  48. firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +332 -0
  49. firecrawl/client.py +281 -0
  50. firecrawl/firecrawl.backup.py +4635 -0
  51. firecrawl/types.py +167 -0
  52. firecrawl/v1/__init__.py +14 -0
  53. firecrawl/v1/client.py +5164 -0
  54. firecrawl/v2/__init__.py +4 -0
  55. firecrawl/v2/client.py +967 -0
  56. firecrawl/v2/client_async.py +408 -0
  57. firecrawl/v2/methods/agent.py +144 -0
  58. firecrawl/v2/methods/aio/__init__.py +1 -0
  59. firecrawl/v2/methods/aio/agent.py +137 -0
  60. firecrawl/v2/methods/aio/batch.py +188 -0
  61. firecrawl/v2/methods/aio/crawl.py +351 -0
  62. firecrawl/v2/methods/aio/extract.py +133 -0
  63. firecrawl/v2/methods/aio/map.py +65 -0
  64. firecrawl/v2/methods/aio/scrape.py +33 -0
  65. firecrawl/v2/methods/aio/search.py +176 -0
  66. firecrawl/v2/methods/aio/usage.py +89 -0
  67. firecrawl/v2/methods/batch.py +499 -0
  68. firecrawl/v2/methods/crawl.py +592 -0
  69. firecrawl/v2/methods/extract.py +161 -0
  70. firecrawl/v2/methods/map.py +83 -0
  71. firecrawl/v2/methods/scrape.py +64 -0
  72. firecrawl/v2/methods/search.py +215 -0
  73. firecrawl/v2/methods/usage.py +84 -0
  74. firecrawl/v2/types.py +1143 -0
  75. firecrawl/v2/utils/__init__.py +9 -0
  76. firecrawl/v2/utils/error_handler.py +107 -0
  77. firecrawl/v2/utils/get_version.py +15 -0
  78. firecrawl/v2/utils/http_client.py +178 -0
  79. firecrawl/v2/utils/http_client_async.py +69 -0
  80. firecrawl/v2/utils/normalize.py +125 -0
  81. firecrawl/v2/utils/validation.py +692 -0
  82. firecrawl/v2/watcher.py +301 -0
  83. firecrawl/v2/watcher_async.py +243 -0
  84. firecrawl-4.12.0.dist-info/METADATA +234 -0
  85. firecrawl-4.12.0.dist-info/RECORD +92 -0
  86. firecrawl-4.12.0.dist-info/WHEEL +5 -0
  87. firecrawl-4.12.0.dist-info/licenses/LICENSE +21 -0
  88. firecrawl-4.12.0.dist-info/top_level.txt +2 -0
  89. tests/test_agent_integration.py +277 -0
  90. tests/test_api_key_handling.py +44 -0
  91. tests/test_change_tracking.py +98 -0
  92. tests/test_timeout_conversion.py +117 -0
firecrawl/v2/watcher.py
@@ -0,0 +1,301 @@
+"""
+WebSocket-based watcher for v2 jobs (crawl and batch), mirroring v1 behavior.
+
+Usage:
+    watcher = client.watcher(job_id, kind="crawl")
+    watcher.add_listener(lambda status: print(status.status))
+    watcher.start()
+"""
+
+import asyncio
+import json
+import threading
+from typing import Callable, List, Optional, Literal, Union, Dict, Any
+
+import websockets
+
+from .types import CrawlJob, BatchScrapeJob, Document
+from .utils.normalize import normalize_document_input
+
+
+JobKind = Literal["crawl", "batch"]
+JobType = Union[CrawlJob, BatchScrapeJob]
+
+
+class Watcher:
+    def __init__(
+        self,
+        client: object,
+        job_id: str,
+        kind: JobKind = "crawl",
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+    ) -> None:
+        self._client = client
+        self._job_id = job_id
+        self._kind = kind
+        self._timeout = timeout
+        self._poll_interval = poll_interval
+        self._listeners: List[Callable[[JobType], None]] = []
+        self._thread: Optional[threading.Thread] = None
+        self._stop = threading.Event()
+
+        http_client = getattr(client, "http_client", None)
+        self._api_url: Optional[str] = getattr(http_client, "api_url", None)
+        self._api_key: Optional[str] = getattr(http_client, "api_key", None)
+
+        # v1-parity state and event handlers
+        self.status: str = "scraping"
+        self.data: List[Dict[str, Any]] = []
+        self._event_handlers: Dict[str, List[Callable[[Dict[str, Any]], None]]] = {
+            "done": [],
+            "error": [],
+            "document": [],
+        }
+        self._sent_done: bool = False
+        self._sent_error: bool = False
+
+    def add_listener(self, callback: Callable[[JobType], None]) -> None:
+        self._listeners.append(callback)
+
+    def _emit(self, status: JobType) -> None:
+        for cb in list(self._listeners):
+            try:
+                cb(status)
+            except Exception:
+                pass
+
+    # v1-like events API
+    def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
+        if event_type in self._event_handlers:
+            self._event_handlers[event_type].append(handler)
+
+    def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
+        if event_type in self._event_handlers:
+            for handler in self._event_handlers[event_type]:
+                try:
+                    handler(detail)
+                except Exception:
+                    pass
+
+    def _build_ws_url(self) -> str:
+        if not self._api_url:
+            raise ValueError("API URL is required for WebSocket watcher")
+        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
+        if self._kind == "crawl":
+            return f"{ws_base}/v2/crawl/{self._job_id}"
+        return f"{ws_base}/v2/batch/scrape/{self._job_id}"
+
+    async def _run_ws(self) -> None:
+        uri = self._build_ws_url()
+        headers_list = []
+        if self._api_key:
+            headers_list.append(("Authorization", f"Bearer {self._api_key}"))
+
+        try:
+            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
+                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
+                while not self._stop.is_set():
+                    # Use short recv timeouts to allow HTTP polling fallback
+                    if deadline is not None:
+                        remaining = max(0.0, deadline - asyncio.get_event_loop().time())
+                        timeout = min(self._poll_interval or remaining, remaining)
+                    else:
+                        timeout = self._poll_interval or 5
+                    try:
+                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                    except asyncio.TimeoutError:
+                        # Quiet period: poll HTTP once to progress statuses
+                        if await self._poll_status_once():
+                            break
+                        else:
+                            continue
+                    except asyncio.CancelledError:
+                        break
+                    except Exception:
+                        # Connection error: switch to HTTP polling until terminal or timeout
+                        while not self._stop.is_set():
+                            if await self._poll_status_once():
+                                return
+                            if deadline is not None and asyncio.get_event_loop().time() >= deadline:
+                                return
+                            await asyncio.sleep(self._poll_interval or 2)
+                        return
+
+                    try:
+                        body = json.loads(msg)
+                    except Exception:
+                        continue
+
+                    # v1-style typed event handling
+                    msg_type = body.get("type")
+                    if msg_type == "error":
+                        self.status = "failed"
+                        self.dispatch_event("error", {
+                            "status": self.status,
+                            "data": self.data,
+                            "error": body.get("error"),
+                            "id": self._job_id,
+                        })
+                        self._sent_error = True
+                        # Emit a final failed snapshot for listeners
+                        if self._kind == "crawl":
+                            job = CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        else:
+                            job = BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        self._emit(job)
+                        break
+                    elif msg_type == "catchup":
+                        d = body.get("data", {})
+                        self.status = d.get("status", self.status)
+                        docs_in = d.get("data", [])
+                        self.data.extend(docs_in)
+                        for doc in docs_in:
+                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
+                    elif msg_type == "document":
+                        doc = body.get("data")
+                        if isinstance(doc, dict):
+                            self.data.append(doc)
+                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
+                    elif msg_type == "done":
+                        self.status = "completed"
+                        # Gather any documents in the done payload
+                        raw_payload = body.get("data", {}) or {}
+                        docs_in = raw_payload.get("data", []) or []
+                        if isinstance(docs_in, list) and docs_in:
+                            for doc in docs_in:
+                                if isinstance(doc, dict):
+                                    self.data.append(doc)
+                        # Dispatch done event first
+                        self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
+                        self._sent_done = True
+                        # Emit a final completed snapshot for listeners and break immediately
+                        docs: List[Document] = []
+                        for doc in self.data:
+                            if isinstance(doc, dict):
+                                d = normalize_document_input(doc)
+                                docs.append(Document(**d))
+                        if self._kind == "crawl":
+                            job = CrawlJob(
+                                status="completed",
+                                completed=raw_payload.get("completed", 0),
+                                total=raw_payload.get("total", 0),
+                                credits_used=raw_payload.get("creditsUsed", 0),
+                                expires_at=raw_payload.get("expiresAt"),
+                                next=raw_payload.get("next"),
+                                data=docs,
+                            )
+                        else:
+                            job = BatchScrapeJob(
+                                status="completed",
+                                completed=raw_payload.get("completed", 0),
+                                total=raw_payload.get("total", 0),
+                                credits_used=raw_payload.get("creditsUsed", 0),
+                                expires_at=raw_payload.get("expiresAt"),
+                                next=raw_payload.get("next"),
+                                data=docs,
+                            )
+                        self._emit(job)
+                        break
+
+                    payload = body.get("data", body)
+                    # Only treat messages with an explicit status as job snapshots
+                    has_status_field = (isinstance(payload, dict) and "status" in payload) or ("status" in body)
+                    if not has_status_field:
+                        continue
+                    status_str = payload.get("status", body.get("status", self.status))
+
+                    if self._kind == "crawl":
+                        docs = []
+                        for doc in payload.get("data", []):
+                            if isinstance(doc, dict):
+                                d = normalize_document_input(doc)
+                                docs.append(Document(**d))
+                        job = CrawlJob(
+                            status=status_str,
+                            completed=payload.get("completed", 0),
+                            total=payload.get("total", 0),
+                            credits_used=payload.get("creditsUsed", 0),
+                            expires_at=payload.get("expiresAt"),
+                            next=payload.get("next"),
+                            data=docs,
+                        )
+                        self._emit(job)
+                        if status_str in ("completed", "failed", "cancelled"):
+                            # Ensure done/error dispatched even if server didn't send explicit event type
+                            if status_str == "completed" and not self._sent_done:
+                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_done = True
+                            if status_str == "failed" and not self._sent_error:
+                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_error = True
+                            break
+                    else:
+                        docs = []
+                        for doc in payload.get("data", []):
+                            if isinstance(doc, dict):
+                                d = normalize_document_input(doc)
+                                docs.append(Document(**d))
+                        job = BatchScrapeJob(
+                            status=status_str,
+                            completed=payload.get("completed", 0),
+                            total=payload.get("total", 0),
+                            credits_used=payload.get("creditsUsed"),
+                            expires_at=payload.get("expiresAt"),
+                            next=payload.get("next"),
+                            data=docs,
+                        )
+                        self._emit(job)
+                        if status_str in ("completed", "failed", "cancelled"):
+                            if status_str == "completed" and not self._sent_done:
+                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_done = True
+                            if status_str == "failed" and not self._sent_error:
+                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
+                                self._sent_error = True
+                            break
+        except Exception:
+            pass
+        finally:
+            # Ensure terminal event parity with v1 even on abrupt disconnects
+            if self.status == "completed" and not self._sent_done:
+                self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
+                self._sent_done = True
+
+    async def _poll_status_once(self) -> bool:
+        """Poll job status over HTTP once. Returns True if terminal."""
+        try:
+            if self._kind == "crawl":
+                job: CrawlJob = await asyncio.to_thread(self._client.get_crawl_status, self._job_id)
+            else:
+                job: BatchScrapeJob = await asyncio.to_thread(self._client.get_batch_scrape_status, self._job_id)
+        except Exception:
+            return False
+
+        self.status = job.status
+        self._emit(job)
+        if job.status in ("completed", "failed", "cancelled"):
+            if job.status == "completed" and not self._sent_done:
+                self.dispatch_event("done", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
+                self._sent_done = True
+            if job.status == "failed" and not self._sent_error:
+                self.dispatch_event("error", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
+                self._sent_error = True
+            return True
+        return False
+
+    def _loop(self) -> None:
+        asyncio.run(self._run_ws())
+
+    def start(self) -> None:
+        if self._thread and self._thread.is_alive():
+            return
+        self._stop.clear()
+        self._thread = threading.Thread(target=self._loop, daemon=True)
+        self._thread.start()
+
+    def stop(self) -> None:
+        self._stop.set()
+        if self._thread:
+            self._thread.join(timeout=1)
+
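For reference, a minimal sketch of how the event API added in watcher.py might be used, following the Usage docstring and the add_listener/add_event_listener/start/stop methods shown above. Here `client` is assumed to be an already-configured v2 Firecrawl client exposing the watcher() factory from the docstring, and `job_id` the id of a crawl that has already been started:

    # Sketch only: `client` and `job_id` are assumed to exist already.
    watcher = client.watcher(job_id, kind="crawl")

    # Snapshot listener: called with a CrawlJob/BatchScrapeJob on each status update.
    watcher.add_listener(lambda job: print(job.status, job.completed, job.total))

    # v1-style typed events dispatched by this watcher: "document", "done", "error".
    watcher.add_event_listener("document", lambda ev: print("document received for job", ev["id"]))
    watcher.add_event_listener("done", lambda ev: print("finished with", len(ev["data"]), "documents"))
    watcher.add_event_listener("error", lambda ev: print("failed:", ev.get("error")))

    watcher.start()  # runs the WebSocket loop on a background daemon thread
    # ... do other work; listeners fire as messages arrive ...
    watcher.stop()   # signals the loop to exit and joins the thread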
firecrawl/v2/watcher_async.py
@@ -0,0 +1,243 @@
+"""
+Async WebSocket watcher with async iterator interface for v2 jobs.
+
+Usage:
+    async for snapshot in AsyncWatcher(client, job_id, kind="crawl"):
+        print(snapshot.status)
+"""
+
+import asyncio
+import inspect
+import json
+import time
+from typing import AsyncIterator, Dict, List, Literal, Optional
+
+import websockets
+from websockets.exceptions import ConnectionClosed, ConnectionClosedOK, ConnectionClosedError
+
+from .types import BatchScrapeJob, CrawlJob, Document
+from .utils.normalize import normalize_document_input
+
+JobKind = Literal["crawl", "batch"]
+
+
+class AsyncWatcher:
+    def __init__(
+        self,
+        client: object,
+        job_id: str,
+        *,
+        kind: JobKind = "crawl",
+        poll_interval: int = 2,
+        timeout: Optional[int] = None,
+    ) -> None:
+        self._client = client
+        self._job_id = job_id
+        self._kind = kind
+        self._timeout = timeout
+        self._poll_interval: float = max(0.0, float(poll_interval))  # Guard against negative values
+
+        http_client = getattr(client, "http_client", None)
+        if http_client is not None:
+            self._api_url = getattr(http_client, "api_url", None)
+            self._api_key = getattr(http_client, "api_key", None)
+        else:
+            # Allow passing the top-level Firecrawl client directly
+            self._api_url = getattr(client, "api_url", None)
+            self._api_key = getattr(client, "api_key", None)
+
+        self._status: str = "scraping"
+        self._data: List[Dict] = []
+
+    def __aiter__(self) -> AsyncIterator[object]:
+        return self._iterate()
+
+    def _build_ws_url(self) -> str:
+        if not self._api_url:
+            raise ValueError("API URL is required for WebSocket watcher")
+        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
+        if self._kind == "crawl":
+            return f"{ws_base}/v2/crawl/{self._job_id}"
+        return f"{ws_base}/v2/batch/scrape/{self._job_id}"
+
+    async def _iterate(self) -> AsyncIterator[object]:
+        uri = self._build_ws_url()
+        headers_list = []
+        if self._api_key:
+            headers_list.append(("Authorization", f"Bearer {self._api_key}"))
+
+        # Attempt to establish WS; on failure, fall back to HTTP polling immediately
+        try:
+            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
+                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
+                # Pre-yield a snapshot if available to ensure progress is visible
+                try:
+                    pre = await self._fetch_job_status()
+                    yield pre
+                    if pre.status in ("completed", "failed", "cancelled"):
+                        return
+                except Exception:
+                    pass
+
+                while True:
+                    try:
+                        if deadline is not None:
+                            remaining = max(0.0, deadline - asyncio.get_event_loop().time())
+                            timeout = min(self._poll_interval, remaining) if remaining > 0 else 0.0
+                        else:
+                            timeout = self._poll_interval
+                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
+                    except asyncio.TimeoutError:
+                        # Quiet period: poll HTTP once
+                        job = await self._safe_fetch()
+                        if job is not None:
+                            yield job
+                            if job.status in ("completed", "failed", "cancelled"):
+                                return
+                        if deadline is not None and asyncio.get_event_loop().time() >= deadline:
+                            return
+                        continue
+                    except (ConnectionClosedOK, ConnectionClosed, ConnectionClosedError):
+                        # Graceful/abrupt close: poll HTTP until terminal (bounded by timeout)
+                        deadline = time.time() + (self._timeout or 30)
+                        while True:
+                            try:
+                                job = await self._fetch_job_status()
+                                yield job
+                                if job.status in ("completed", "failed", "cancelled"):
+                                    return
+                            except Exception:
+                                return
+                            if time.time() >= deadline:
+                                return
+                            await asyncio.sleep(1)
+                    try:
+                        body = json.loads(msg)
+                    except Exception:
+                        continue
+
+                    msg_type = body.get("type")
+                    if msg_type == "error":
+                        self._status = "failed"
+                        # Yield a terminal snapshot
+                        if self._kind == "crawl":
+                            yield CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        else:
+                            yield BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
+                        return
+                    elif msg_type == "catchup":
+                        d = body.get("data", {})
+                        self._status = d.get("status", self._status)
+                        docs_in = d.get("data", []) or []
+                        self._data.extend(docs_in)
+                        # Fall through to emit a snapshot below
+                    elif msg_type == "document":
+                        doc = body.get("data")
+                        if isinstance(doc, dict):
+                            self._data.append(doc)
+                        # Fall through to emit a snapshot below
+                    elif msg_type == "done":
+                        self._status = "completed"
+                        raw_payload = body.get("data", {}) or {}
+                        docs_in = raw_payload.get("data", []) or []
+                        if isinstance(docs_in, list) and docs_in:
+                            for doc in docs_in:
+                                if isinstance(doc, dict):
+                                    self._data.append(doc)
+                        # Emit final snapshot then end
+                        yield self._make_snapshot(status="completed", payload=raw_payload, docs_override=self._data)
+                        return
+
+                    # Generic snapshot emit for status messages and periodic progress
+                    payload = body.get("data", body)
+                    status_str = payload.get("status", body.get("status", self._status))
+                    snapshot = self._make_snapshot(status=status_str, payload=payload)
+                    yield snapshot
+                    if status_str in ("completed", "failed", "cancelled"):
+                        return
+        except Exception:
+            # WS connect failure: fallback to HTTP polling loop until terminal/timeout
+            deadline = time.time() + (self._timeout or 30)
+            while True:
+                try:
+                    job = await self._fetch_job_status()
+                    yield job
+                    if job.status in ("completed", "failed", "cancelled"):
+                        return
+                except Exception:
+                    return
+                if time.time() >= deadline:
+                    return
+                await asyncio.sleep(1)
+
+    async def _fetch_job_status(self):
+        if self._kind == "crawl":
+            return await self._call_status_method("get_crawl_status")
+        return await self._call_status_method("get_batch_scrape_status")
+
+    async def _call_status_method(self, method_name: str):
+        # Try on client directly
+        meth = getattr(self._client, method_name, None)
+        if meth is not None:
+            try:
+                result = meth(self._job_id)
+            except TypeError:
+                result = None
+            if result is not None:
+                if inspect.isawaitable(result):
+                    return await result
+                return result
+            # Fallback: if we couldn't call directly, try to_thread
+            return await asyncio.to_thread(meth, self._job_id)
+
+        # Try on client.v2
+        v2 = getattr(self._client, "v2", None)
+        if v2 is not None:
+            meth = getattr(v2, method_name, None)
+            if meth is not None:
+                try:
+                    result = meth(self._job_id)
+                except TypeError:
+                    result = None
+                if result is not None:
+                    if inspect.isawaitable(result):
+                        return await result
+                    return result
+                return await asyncio.to_thread(meth, self._job_id)
+
+        raise RuntimeError(f"Client does not expose {method_name}")
+
+    async def _safe_fetch(self):
+        try:
+            return await self._fetch_job_status()
+        except Exception:
+            return None
+
+    def _make_snapshot(self, *, status: str, payload: Dict, docs_override: Optional[List[Dict]] = None):
+        docs = []
+        source_docs = docs_override if docs_override is not None else payload.get("data", []) or []
+        for doc in source_docs:
+            if isinstance(doc, dict):
+                d = normalize_document_input(doc)
+                docs.append(Document(**d))
+
+        if self._kind == "crawl":
+            return CrawlJob(
+                status=status,
+                completed=payload.get("completed", 0),
+                total=payload.get("total", 0),
+                credits_used=payload.get("creditsUsed", 0),
+                expires_at=payload.get("expiresAt"),
+                next=payload.get("next"),
+                data=docs,
+            )
+        return BatchScrapeJob(
+            status=status,
+            completed=payload.get("completed", 0),
+            total=payload.get("total", 0),
+            credits_used=payload.get("creditsUsed"),
+            expires_at=payload.get("expiresAt"),
+            next=payload.get("next"),
+            data=docs,
+        )
+
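Similarly, a minimal sketch of consuming AsyncWatcher via its async-iterator interface, extending the Usage docstring above. It assumes `client` exposes a get_crawl_status method (sync or async, as handled by _call_status_method) and that `job_id` refers to a crawl that has already been started:

    # Sketch only: `client` and `job_id` are assumed to exist already.
    import asyncio

    from firecrawl.v2.watcher_async import AsyncWatcher

    async def watch(client, job_id: str) -> None:
        async for snapshot in AsyncWatcher(client, job_id, kind="crawl", timeout=120):
            # Each snapshot is a CrawlJob (or BatchScrapeJob for kind="batch").
            print(snapshot.status, f"{snapshot.completed}/{snapshot.total}")
            # The iterator stops on its own at "completed", "failed", or "cancelled".

    # asyncio.run(watch(client, job_id))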