firecrawl-py 3.3.1__py3-none-any.whl → 3.3.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of firecrawl-py might be problematic.

Files changed (84)
  1. firecrawl/__init__.py +1 -1
  2. firecrawl/__tests__/e2e/v2/test_scrape.py +37 -1
  3. firecrawl/client.py +8 -4
  4. firecrawl/v2/types.py +19 -2
  5. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/METADATA +7 -3
  6. firecrawl_py-3.3.3.dist-info/RECORD +79 -0
  7. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/WHEEL +1 -1
  8. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info/licenses}/LICENSE +0 -0
  9. {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/top_level.txt +0 -2
  10. build/lib/firecrawl/__init__.py +0 -87
  11. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -79
  12. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -188
  13. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -38
  14. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -40
  15. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -137
  16. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -248
  17. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -35
  18. build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -43
  19. build/lib/firecrawl/__tests__/e2e/v2/conftest.py +0 -73
  20. build/lib/firecrawl/__tests__/e2e/v2/test_async.py +0 -73
  21. build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -105
  22. build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -276
  23. build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +0 -54
  24. build/lib/firecrawl/__tests__/e2e/v2/test_map.py +0 -60
  25. build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -154
  26. build/lib/firecrawl/__tests__/e2e/v2/test_search.py +0 -269
  27. build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +0 -26
  28. build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -65
  29. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -12
  30. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -61
  31. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -12
  32. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -19
  33. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -50
  34. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -63
  35. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -28
  36. build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -117
  37. build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -90
  38. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -70
  39. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -240
  40. build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -107
  41. build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -53
  42. build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -92
  43. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -167
  44. build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -236
  45. build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -18
  46. build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -123
  47. build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -290
  48. build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -332
  49. build/lib/firecrawl/client.py +0 -242
  50. build/lib/firecrawl/firecrawl.backup.py +0 -4635
  51. build/lib/firecrawl/types.py +0 -161
  52. build/lib/firecrawl/v1/__init__.py +0 -14
  53. build/lib/firecrawl/v1/client.py +0 -4653
  54. build/lib/firecrawl/v2/__init__.py +0 -4
  55. build/lib/firecrawl/v2/client.py +0 -805
  56. build/lib/firecrawl/v2/client_async.py +0 -250
  57. build/lib/firecrawl/v2/methods/aio/__init__.py +0 -1
  58. build/lib/firecrawl/v2/methods/aio/batch.py +0 -85
  59. build/lib/firecrawl/v2/methods/aio/crawl.py +0 -171
  60. build/lib/firecrawl/v2/methods/aio/extract.py +0 -126
  61. build/lib/firecrawl/v2/methods/aio/map.py +0 -59
  62. build/lib/firecrawl/v2/methods/aio/scrape.py +0 -33
  63. build/lib/firecrawl/v2/methods/aio/search.py +0 -172
  64. build/lib/firecrawl/v2/methods/aio/usage.py +0 -42
  65. build/lib/firecrawl/v2/methods/batch.py +0 -417
  66. build/lib/firecrawl/v2/methods/crawl.py +0 -469
  67. build/lib/firecrawl/v2/methods/extract.py +0 -131
  68. build/lib/firecrawl/v2/methods/map.py +0 -77
  69. build/lib/firecrawl/v2/methods/scrape.py +0 -64
  70. build/lib/firecrawl/v2/methods/search.py +0 -197
  71. build/lib/firecrawl/v2/methods/usage.py +0 -41
  72. build/lib/firecrawl/v2/types.py +0 -665
  73. build/lib/firecrawl/v2/utils/__init__.py +0 -9
  74. build/lib/firecrawl/v2/utils/error_handler.py +0 -107
  75. build/lib/firecrawl/v2/utils/get_version.py +0 -15
  76. build/lib/firecrawl/v2/utils/http_client.py +0 -153
  77. build/lib/firecrawl/v2/utils/http_client_async.py +0 -65
  78. build/lib/firecrawl/v2/utils/normalize.py +0 -107
  79. build/lib/firecrawl/v2/utils/validation.py +0 -324
  80. build/lib/firecrawl/v2/watcher.py +0 -301
  81. build/lib/firecrawl/v2/watcher_async.py +0 -242
  82. build/lib/tests/test_change_tracking.py +0 -98
  83. build/lib/tests/test_timeout_conversion.py +0 -117
  84. firecrawl_py-3.3.1.dist-info/RECORD +0 -153
build/lib/firecrawl/v2/utils/validation.py
@@ -1,324 +0,0 @@
- """
- Shared validation functions for Firecrawl v2 API.
- """
-
- from typing import Optional, Dict, Any, List
- from ..types import ScrapeOptions, ScrapeFormats
-
-
- def _convert_format_string(format_str: str) -> str:
-     """
-     Convert format string from snake_case to camelCase.
-
-     Args:
-         format_str: Format string in snake_case
-
-     Returns:
-         Format string in camelCase
-     """
-     format_mapping = {
-         "raw_html": "rawHtml",
-         "change_tracking": "changeTracking",
-         "screenshot_full_page": "screenshot@fullPage"
-     }
-     return format_mapping.get(format_str, format_str)
-
-
- def _normalize_schema(schema: Any) -> Optional[Dict[str, Any]]:
-     """
-     Normalize a schema object which may be a dict, Pydantic BaseModel subclass,
-     or a Pydantic model instance into a plain dict.
-     """
-     try:
-         # Pydantic v2 BaseModel subclass: has "model_json_schema"
-         if hasattr(schema, "model_json_schema") and callable(schema.model_json_schema):
-             return schema.model_json_schema()
-         # Pydantic v2 BaseModel instance: has "model_dump" or "model_json_schema"
-         if hasattr(schema, "model_dump") and callable(schema.model_dump):
-             # Try to get JSON schema if available on the class
-             mjs = getattr(schema.__class__, "model_json_schema", None)
-             if callable(mjs):
-                 return schema.__class__.model_json_schema()
-             # Fallback to data shape (not ideal, but better than dropping)
-             return schema.model_dump()
-         # Pydantic v1 BaseModel subclass: has "schema"
-         if hasattr(schema, "schema") and callable(schema.schema):
-             return schema.schema()
-         # Pydantic v1 BaseModel instance
-         if hasattr(schema, "dict") and callable(schema.dict):
-             # Prefer class-level schema if present
-             sch = getattr(schema.__class__, "schema", None)
-             if callable(sch):
-                 return schema.__class__.schema()
-             return schema.dict()
-     except Exception:
-         pass
-     # Already a dict or unsupported type
-     return schema if isinstance(schema, dict) else None
-
-
- def _validate_json_format(format_obj: Any) -> Dict[str, Any]:
-     """
-     Validate and prepare json format object.
-
-     Args:
-         format_obj: Format object that should be json type
-
-     Returns:
-         Validated json format dict
-
-     Raises:
-         ValueError: If json format is missing required fields
-     """
-     if not isinstance(format_obj, dict):
-         raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-
-     if format_obj.get('type') != 'json':
-         raise ValueError("json format must have type='json'")
-
-     # prompt is optional in v2; only normalize when present
-     # schema is recommended; if provided, normalize Pydantic forms
-     schema = format_obj.get('schema')
-     normalized = dict(format_obj)
-     if schema is not None:
-         normalized_schema = _normalize_schema(schema)
-         if normalized_schema is not None:
-             normalized['schema'] = normalized_schema
-     return normalized
-
-
- def validate_scrape_options(options: Optional[ScrapeOptions]) -> Optional[ScrapeOptions]:
-     """
-     Validate and normalize scrape options.
-
-     Args:
-         options: Scraping options to validate
-
-     Returns:
-         Validated options or None
-
-     Raises:
-         ValueError: If options are invalid
-     """
-     if options is None:
-         return None
-
-     # Validate timeout
-     if options.timeout is not None and options.timeout <= 0:
-         raise ValueError("Timeout must be positive")
-
-     # Validate wait_for
-     if options.wait_for is not None and options.wait_for < 0:
-         raise ValueError("wait_for must be non-negative")
-
-     return options
-
-
- def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[str, Any]]:
-     """
-     Prepare ScrapeOptions for API submission with manual snake_case to camelCase conversion.
-
-     Args:
-         options: ScrapeOptions to prepare
-
-     Returns:
-         Dictionary ready for API submission or None if options is None
-     """
-     if options is None:
-         return None
-
-     # Validate options first
-     validated_options = validate_scrape_options(options)
-     if validated_options is None:
-         return None
-
-     # Apply default values for None fields
-     default_values = {
-         "only_main_content": True,
-         "mobile": False,
-         "skip_tls_verification": True,
-         "remove_base64_images": True,
-         "fast_mode": False,
-         "block_ads": True,
-         "max_age": 14400000,
-         "store_in_cache": True
-     }
-
-     # Convert to dict and handle manual snake_case to camelCase conversion
-     options_data = validated_options.model_dump(exclude_none=True)
-
-     # Apply defaults for None fields
-     for field, default_value in default_values.items():
-         if field not in options_data:
-             options_data[field] = default_value
-
-     scrape_data = {}
-
-     # Manual field mapping for snake_case to camelCase conversion
-     field_mappings = {
-         "include_tags": "includeTags",
-         "exclude_tags": "excludeTags",
-         "only_main_content": "onlyMainContent",
-         "wait_for": "waitFor",
-         "skip_tls_verification": "skipTlsVerification",
-         "remove_base64_images": "removeBase64Images",
-         "fast_mode": "fastMode",
-         "use_mock": "useMock",
-         "block_ads": "blockAds",
-         "store_in_cache": "storeInCache",
-         "max_age": "maxAge"
-     }
-
-     # Apply field mappings
-     for snake_case, camel_case in field_mappings.items():
-         if snake_case in options_data:
-             scrape_data[camel_case] = options_data.pop(snake_case)
-
-     # Handle special cases
-     for key, value in options_data.items():
-         if value is not None:
-             if key == "formats":
-                 # Handle formats conversion
-                 converted_formats: List[Any] = []
-
-                 # Prefer using original object to detect ScrapeFormats vs list
-                 original_formats = getattr(options, 'formats', None)
-
-                 if isinstance(original_formats, ScrapeFormats):
-                     # Include explicit list first
-                     if original_formats.formats:
-                         for fmt in original_formats.formats:
-                             if isinstance(fmt, str):
-                                 if fmt == "json":
-                                     raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-                                 converted_formats.append(_convert_format_string(fmt))
-                             elif isinstance(fmt, dict):
-                                 fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
-                                 if fmt_type == 'json':
-                                     validated_json = _validate_json_format({**fmt, 'type': 'json'})
-                                     converted_formats.append(validated_json)
-                                 elif fmt_type == 'screenshot':
-                                     # Normalize screenshot options
-                                     normalized = {**fmt, 'type': 'screenshot'}
-                                     if 'full_page' in normalized:
-                                         normalized['fullPage'] = normalized.pop('full_page')
-                                     # Normalize viewport if it's a model instance
-                                     vp = normalized.get('viewport')
-                                     if hasattr(vp, 'model_dump'):
-                                         normalized['viewport'] = vp.model_dump(exclude_none=True)
-                                     converted_formats.append(normalized)
-                                 else:
-                                     if 'type' in fmt:
-                                         fmt['type'] = fmt_type or fmt['type']
-                                     converted_formats.append(fmt)
-                             elif hasattr(fmt, 'type'):
-                                 if fmt.type == 'json':
-                                     converted_formats.append(_validate_json_format(fmt.model_dump()))
-                                 else:
-                                     converted_formats.append(_convert_format_string(fmt.type))
-                             else:
-                                 converted_formats.append(fmt)
-
-                     # Add booleans from ScrapeFormats
-                     if original_formats.markdown:
-                         converted_formats.append("markdown")
-                     if original_formats.html:
-                         converted_formats.append("html")
-                     if original_formats.raw_html:
-                         converted_formats.append("rawHtml")
-                     if original_formats.summary:
-                         converted_formats.append("summary")
-                     if original_formats.links:
-                         converted_formats.append("links")
-                     if original_formats.screenshot:
-                         converted_formats.append("screenshot")
-                     if original_formats.change_tracking:
-                         converted_formats.append("changeTracking")
-                     # Note: We intentionally do not auto-include 'json' when boolean is set,
-                     # because JSON requires an object with schema/prompt. The caller must
-                     # supply the full json format object explicitly.
-                 elif isinstance(original_formats, list):
-                     for fmt in original_formats:
-                         if isinstance(fmt, str):
-                             if fmt == "json":
-                                 raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-                             converted_formats.append(_convert_format_string(fmt))
-                         elif isinstance(fmt, dict):
-                             fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
-                             if fmt_type == 'json':
-                                 validated_json = _validate_json_format({**fmt, 'type': 'json'})
-                                 converted_formats.append(validated_json)
-                             elif fmt_type == 'screenshot':
-                                 normalized = {**fmt, 'type': 'screenshot'}
-                                 if 'full_page' in normalized:
-                                     normalized['fullPage'] = normalized.pop('full_page')
-                                 vp = normalized.get('viewport')
-                                 if hasattr(vp, 'model_dump'):
-                                     normalized['viewport'] = vp.model_dump(exclude_none=True)
-                                 converted_formats.append(normalized)
-                             else:
-                                 if 'type' in fmt:
-                                     fmt['type'] = fmt_type or fmt['type']
-                                 converted_formats.append(fmt)
-                         elif hasattr(fmt, 'type'):
-                             if fmt.type == 'json':
-                                 converted_formats.append(_validate_json_format(fmt.model_dump()))
-                             elif fmt.type == 'screenshot':
-                                 normalized = {'type': 'screenshot'}
-                                 if getattr(fmt, 'full_page', None) is not None:
-                                     normalized['fullPage'] = fmt.full_page
-                                 if getattr(fmt, 'quality', None) is not None:
-                                     normalized['quality'] = fmt.quality
-                                 vp = getattr(fmt, 'viewport', None)
-                                 if vp is not None:
-                                     normalized['viewport'] = vp.model_dump(exclude_none=True) if hasattr(vp, 'model_dump') else vp
-                                 converted_formats.append(normalized)
-                             else:
-                                 converted_formats.append(_convert_format_string(fmt.type))
-                         else:
-                             converted_formats.append(fmt)
-                 else:
-                     # Fallback: try to iterate over value if it's a list-like
-                     try:
-                         for fmt in value:
-                             converted_formats.append(fmt)
-                     except TypeError:
-                         pass
-
-                 if converted_formats:
-                     scrape_data["formats"] = converted_formats
-             elif key == "actions":
-                 # Handle actions conversion
-                 converted_actions = []
-                 for action in value:
-                     if isinstance(action, dict):
-                         # Convert action dict
-                         converted_action = {}
-                         for action_key, action_value in action.items():
-                             if action_key == "full_page":
-                                 converted_action["fullPage"] = action_value
-                             else:
-                                 converted_action[action_key] = action_value
-                         converted_actions.append(converted_action)
-                     else:
-                         # Handle action objects
-                         action_data = action.model_dump(exclude_none=True)
-                         converted_action = {}
-                         for action_key, action_value in action_data.items():
-                             if action_key == "full_page":
-                                 converted_action["fullPage"] = action_value
-                             else:
-                                 converted_action[action_key] = action_value
-                         converted_actions.append(converted_action)
-                 scrape_data["actions"] = converted_actions
-             elif key == "location":
-                 # Handle location conversion
-                 if isinstance(value, dict):
-                     scrape_data["location"] = value
-                 else:
-                     scrape_data["location"] = value.model_dump(exclude_none=True)
-             else:
-                 # For fields that don't need conversion, use as-is
-                 scrape_data[key] = value
-
-     return scrape_data
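For reference, a minimal sketch of what the removed prepare_scrape_options helper does with snake_case fields (the build/lib copy above presumably mirrors the packaged firecrawl/v2/utils/validation.py). The import paths and the ScrapeOptions fields shown are assumptions for illustration, not a verified API surface:

# Hedged sketch: illustrates the snake_case -> camelCase conversion performed by
# prepare_scrape_options, following the field_mappings table in the file above.
# Module paths and ScrapeOptions keyword arguments are assumed, not verified here.
from firecrawl.v2.types import ScrapeOptions
from firecrawl.v2.utils.validation import prepare_scrape_options

opts = ScrapeOptions(only_main_content=False, wait_for=1000, formats=["markdown", "raw_html"])
payload = prepare_scrape_options(opts)
# Expected shape (defaults such as blockAds and maxAge are also filled in by the helper):
# {"onlyMainContent": False, "waitFor": 1000, "formats": ["markdown", "rawHtml"], ...}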
build/lib/firecrawl/v2/watcher.py
@@ -1,301 +0,0 @@
- """
- WebSocket-based watcher for v2 jobs (crawl and batch), mirroring v1 behavior.
-
- Usage:
-     watcher = client.watcher(job_id, kind="crawl")
-     watcher.add_listener(lambda status: print(status.status))
-     watcher.start()
- """
-
- import asyncio
- import json
- import threading
- from typing import Callable, List, Optional, Literal, Union, Dict, Any
-
- import websockets
-
- from .types import CrawlJob, BatchScrapeJob, Document
- from .utils.normalize import normalize_document_input
-
-
- JobKind = Literal["crawl", "batch"]
- JobType = Union[CrawlJob, BatchScrapeJob]
-
-
- class Watcher:
-     def __init__(
-         self,
-         client: object,
-         job_id: str,
-         kind: JobKind = "crawl",
-         poll_interval: int = 2,
-         timeout: Optional[int] = None,
-     ) -> None:
-         self._client = client
-         self._job_id = job_id
-         self._kind = kind
-         self._timeout = timeout
-         self._poll_interval = poll_interval
-         self._listeners: List[Callable[[JobType], None]] = []
-         self._thread: Optional[threading.Thread] = None
-         self._stop = threading.Event()
-
-         http_client = getattr(client, "http_client", None)
-         self._api_url: Optional[str] = getattr(http_client, "api_url", None)
-         self._api_key: Optional[str] = getattr(http_client, "api_key", None)
-
-         # v1-parity state and event handlers
-         self.status: str = "scraping"
-         self.data: List[Dict[str, Any]] = []
-         self._event_handlers: Dict[str, List[Callable[[Dict[str, Any]], None]]] = {
-             "done": [],
-             "error": [],
-             "document": [],
-         }
-         self._sent_done: bool = False
-         self._sent_error: bool = False
-
-     def add_listener(self, callback: Callable[[JobType], None]) -> None:
-         self._listeners.append(callback)
-
-     def _emit(self, status: JobType) -> None:
-         for cb in list(self._listeners):
-             try:
-                 cb(status)
-             except Exception:
-                 pass
-
-     # v1-like events API
-     def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
-         if event_type in self._event_handlers:
-             self._event_handlers[event_type].append(handler)
-
-     def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
-         if event_type in self._event_handlers:
-             for handler in self._event_handlers[event_type]:
-                 try:
-                     handler(detail)
-                 except Exception:
-                     pass
-
-     def _build_ws_url(self) -> str:
-         if not self._api_url:
-             raise ValueError("API URL is required for WebSocket watcher")
-         ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
-         if self._kind == "crawl":
-             return f"{ws_base}/v2/crawl/{self._job_id}"
-         return f"{ws_base}/v2/batch/scrape/{self._job_id}"
-
-     async def _run_ws(self) -> None:
-         uri = self._build_ws_url()
-         headers_list = []
-         if self._api_key:
-             headers_list.append(("Authorization", f"Bearer {self._api_key}"))
-
-         try:
-             async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
-                 deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
-                 while not self._stop.is_set():
-                     # Use short recv timeouts to allow HTTP polling fallback
-                     if deadline is not None:
-                         remaining = max(0.0, deadline - asyncio.get_event_loop().time())
-                         timeout = min(self._poll_interval or remaining, remaining)
-                     else:
-                         timeout = self._poll_interval or 5
-                     try:
-                         msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
-                     except asyncio.TimeoutError:
-                         # Quiet period: poll HTTP once to progress statuses
-                         if await self._poll_status_once():
-                             break
-                         else:
-                             continue
-                     except asyncio.CancelledError:
-                         break
-                     except Exception:
-                         # Connection error: switch to HTTP polling until terminal or timeout
-                         while not self._stop.is_set():
-                             if await self._poll_status_once():
-                                 return
-                             if deadline is not None and asyncio.get_event_loop().time() >= deadline:
-                                 return
-                             await asyncio.sleep(self._poll_interval or 2)
-                         return
-
-                     try:
-                         body = json.loads(msg)
-                     except Exception:
-                         continue
-
-                     # v1-style typed event handling
-                     msg_type = body.get("type")
-                     if msg_type == "error":
-                         self.status = "failed"
-                         self.dispatch_event("error", {
-                             "status": self.status,
-                             "data": self.data,
-                             "error": body.get("error"),
-                             "id": self._job_id,
-                         })
-                         self._sent_error = True
-                         # Emit a final failed snapshot for listeners
-                         if self._kind == "crawl":
-                             job = CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
-                         else:
-                             job = BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
-                         self._emit(job)
-                         break
-                     elif msg_type == "catchup":
-                         d = body.get("data", {})
-                         self.status = d.get("status", self.status)
-                         docs_in = d.get("data", [])
-                         self.data.extend(docs_in)
-                         for doc in docs_in:
-                             self.dispatch_event("document", {"data": doc, "id": self._job_id})
-                     elif msg_type == "document":
-                         doc = body.get("data")
-                         if isinstance(doc, dict):
-                             self.data.append(doc)
-                             self.dispatch_event("document", {"data": doc, "id": self._job_id})
-                     elif msg_type == "done":
-                         self.status = "completed"
-                         # Gather any documents in the done payload
-                         raw_payload = body.get("data", {}) or {}
-                         docs_in = raw_payload.get("data", []) or []
-                         if isinstance(docs_in, list) and docs_in:
-                             for doc in docs_in:
-                                 if isinstance(doc, dict):
-                                     self.data.append(doc)
-                         # Dispatch done event first
-                         self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
-                         self._sent_done = True
-                         # Emit a final completed snapshot for listeners and break immediately
-                         docs: List[Document] = []
-                         for doc in self.data:
-                             if isinstance(doc, dict):
-                                 d = normalize_document_input(doc)
-                                 docs.append(Document(**d))
-                         if self._kind == "crawl":
-                             job = CrawlJob(
-                                 status="completed",
-                                 completed=raw_payload.get("completed", 0),
-                                 total=raw_payload.get("total", 0),
-                                 credits_used=raw_payload.get("creditsUsed", 0),
-                                 expires_at=raw_payload.get("expiresAt"),
-                                 next=raw_payload.get("next"),
-                                 data=docs,
-                             )
-                         else:
-                             job = BatchScrapeJob(
-                                 status="completed",
-                                 completed=raw_payload.get("completed", 0),
-                                 total=raw_payload.get("total", 0),
-                                 credits_used=raw_payload.get("creditsUsed", 0),
-                                 expires_at=raw_payload.get("expiresAt"),
-                                 next=raw_payload.get("next"),
-                                 data=docs,
-                             )
-                         self._emit(job)
-                         break
-
-                     payload = body.get("data", body)
-                     # Only treat messages with an explicit status as job snapshots
-                     has_status_field = (isinstance(payload, dict) and "status" in payload) or ("status" in body)
-                     if not has_status_field:
-                         continue
-                     status_str = payload.get("status", body.get("status", self.status))
-
-                     if self._kind == "crawl":
-                         docs = []
-                         for doc in payload.get("data", []):
-                             if isinstance(doc, dict):
-                                 d = normalize_document_input(doc)
-                                 docs.append(Document(**d))
-                         job = CrawlJob(
-                             status=status_str,
-                             completed=payload.get("completed", 0),
-                             total=payload.get("total", 0),
-                             credits_used=payload.get("creditsUsed", 0),
-                             expires_at=payload.get("expiresAt"),
-                             next=payload.get("next"),
-                             data=docs,
-                         )
-                         self._emit(job)
-                         if status_str in ("completed", "failed", "cancelled"):
-                             # Ensure done/error dispatched even if server didn't send explicit event type
-                             if status_str == "completed" and not self._sent_done:
-                                 self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
-                                 self._sent_done = True
-                             if status_str == "failed" and not self._sent_error:
-                                 self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
-                                 self._sent_error = True
-                             break
-                     else:
-                         docs = []
-                         for doc in payload.get("data", []):
-                             if isinstance(doc, dict):
-                                 d = normalize_document_input(doc)
-                                 docs.append(Document(**d))
-                         job = BatchScrapeJob(
-                             status=status_str,
-                             completed=payload.get("completed", 0),
-                             total=payload.get("total", 0),
-                             credits_used=payload.get("creditsUsed"),
-                             expires_at=payload.get("expiresAt"),
-                             next=payload.get("next"),
-                             data=docs,
-                         )
-                         self._emit(job)
-                         if status_str in ("completed", "failed", "cancelled"):
-                             if status_str == "completed" and not self._sent_done:
-                                 self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
-                                 self._sent_done = True
-                             if status_str == "failed" and not self._sent_error:
-                                 self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
-                                 self._sent_error = True
-                             break
-         except Exception:
-             pass
-         finally:
-             # Ensure terminal event parity with v1 even on abrupt disconnects
-             if self.status == "completed" and not self._sent_done:
-                 self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
-                 self._sent_done = True
-
-     async def _poll_status_once(self) -> bool:
-         """Poll job status over HTTP once. Returns True if terminal."""
-         try:
-             if self._kind == "crawl":
-                 job: CrawlJob = await asyncio.to_thread(self._client.get_crawl_status, self._job_id)
-             else:
-                 job: BatchScrapeJob = await asyncio.to_thread(self._client.get_batch_scrape_status, self._job_id)
-         except Exception:
-             return False
-
-         self.status = job.status
-         self._emit(job)
-         if job.status in ("completed", "failed", "cancelled"):
-             if job.status == "completed" and not self._sent_done:
-                 self.dispatch_event("done", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
-                 self._sent_done = True
-             if job.status == "failed" and not self._sent_error:
-                 self.dispatch_event("error", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
-                 self._sent_error = True
-             return True
-         return False
-
-     def _loop(self) -> None:
-         asyncio.run(self._run_ws())
-
-     def start(self) -> None:
-         if self._thread and self._thread.is_alive():
-             return
-         self._stop.clear()
-         self._thread = threading.Thread(target=self._loop, daemon=True)
-         self._thread.start()
-
-     def stop(self) -> None:
-         self._stop.set()
-         if self._thread:
-             self._thread.join(timeout=1)
-
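The docstring at the top of the removed watcher module shows the intended usage; here is a hedged sketch fleshing it out. The Firecrawl client class and the start_crawl call are assumptions about the public SDK surface, not something established by this diff:

# Hedged sketch based on the module docstring above. Client construction and the
# crawl-start helper are illustrative assumptions; watcher(), add_listener(),
# add_event_listener(), start() and stop() come from the code shown in the diff.
from firecrawl import Firecrawl

client = Firecrawl(api_key="fc-YOUR-KEY")               # assumed v2 client entry point
crawl_job = client.start_crawl("https://example.com")   # assumed to return a job with an .id
watcher = client.watcher(crawl_job.id, kind="crawl")    # as in the removed module's docstring
watcher.add_event_listener("document", lambda ev: print("document for job", ev["id"]))
watcher.add_listener(lambda snapshot: print("status:", snapshot.status))
watcher.start()
# ... do other work; the watcher listens over WebSocket with an HTTP polling fallback ...
watcher.stop()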