firecrawl-py 3.3.1-py3-none-any.whl → 3.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of firecrawl-py might be problematic.
- firecrawl/__init__.py +1 -1
- firecrawl/__tests__/e2e/v2/test_scrape.py +37 -1
- firecrawl/client.py +8 -4
- firecrawl/v2/types.py +19 -2
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/METADATA +7 -3
- firecrawl_py-3.3.3.dist-info/RECORD +79 -0
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/WHEEL +1 -1
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info/licenses}/LICENSE +0 -0
- {firecrawl_py-3.3.1.dist-info → firecrawl_py-3.3.3.dist-info}/top_level.txt +0 -2
- build/lib/firecrawl/__init__.py +0 -87
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_batch_scrape.py +0 -79
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_crawl.py +0 -188
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_extract.py +0 -38
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_map.py +0 -40
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_scrape.py +0 -137
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_search.py +0 -248
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_usage.py +0 -35
- build/lib/firecrawl/__tests__/e2e/v2/aio/test_aio_watcher.py +0 -43
- build/lib/firecrawl/__tests__/e2e/v2/conftest.py +0 -73
- build/lib/firecrawl/__tests__/e2e/v2/test_async.py +0 -73
- build/lib/firecrawl/__tests__/e2e/v2/test_batch_scrape.py +0 -105
- build/lib/firecrawl/__tests__/e2e/v2/test_crawl.py +0 -276
- build/lib/firecrawl/__tests__/e2e/v2/test_extract.py +0 -54
- build/lib/firecrawl/__tests__/e2e/v2/test_map.py +0 -60
- build/lib/firecrawl/__tests__/e2e/v2/test_scrape.py +0 -154
- build/lib/firecrawl/__tests__/e2e/v2/test_search.py +0 -269
- build/lib/firecrawl/__tests__/e2e/v2/test_usage.py +0 -26
- build/lib/firecrawl/__tests__/e2e/v2/test_watcher.py +0 -65
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_params.py +0 -12
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_request_preparation.py +0 -61
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_crawl_validation.py +0 -12
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_map_request_preparation.py +0 -19
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_scrape_request_preparation.py +0 -50
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_aio_search_request_preparation.py +0 -63
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_batch_request_preparation_async.py +0 -28
- build/lib/firecrawl/__tests__/unit/v2/methods/aio/test_ensure_async.py +0 -117
- build/lib/firecrawl/__tests__/unit/v2/methods/test_batch_request_preparation.py +0 -90
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_params.py +0 -70
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_request_preparation.py +0 -240
- build/lib/firecrawl/__tests__/unit/v2/methods/test_crawl_validation.py +0 -107
- build/lib/firecrawl/__tests__/unit/v2/methods/test_map_request_preparation.py +0 -53
- build/lib/firecrawl/__tests__/unit/v2/methods/test_scrape_request_preparation.py +0 -92
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_request_preparation.py +0 -167
- build/lib/firecrawl/__tests__/unit/v2/methods/test_search_validation.py +0 -236
- build/lib/firecrawl/__tests__/unit/v2/methods/test_usage_types.py +0 -18
- build/lib/firecrawl/__tests__/unit/v2/methods/test_webhook.py +0 -123
- build/lib/firecrawl/__tests__/unit/v2/utils/test_validation.py +0 -290
- build/lib/firecrawl/__tests__/unit/v2/watcher/test_ws_watcher.py +0 -332
- build/lib/firecrawl/client.py +0 -242
- build/lib/firecrawl/firecrawl.backup.py +0 -4635
- build/lib/firecrawl/types.py +0 -161
- build/lib/firecrawl/v1/__init__.py +0 -14
- build/lib/firecrawl/v1/client.py +0 -4653
- build/lib/firecrawl/v2/__init__.py +0 -4
- build/lib/firecrawl/v2/client.py +0 -805
- build/lib/firecrawl/v2/client_async.py +0 -250
- build/lib/firecrawl/v2/methods/aio/__init__.py +0 -1
- build/lib/firecrawl/v2/methods/aio/batch.py +0 -85
- build/lib/firecrawl/v2/methods/aio/crawl.py +0 -171
- build/lib/firecrawl/v2/methods/aio/extract.py +0 -126
- build/lib/firecrawl/v2/methods/aio/map.py +0 -59
- build/lib/firecrawl/v2/methods/aio/scrape.py +0 -33
- build/lib/firecrawl/v2/methods/aio/search.py +0 -172
- build/lib/firecrawl/v2/methods/aio/usage.py +0 -42
- build/lib/firecrawl/v2/methods/batch.py +0 -417
- build/lib/firecrawl/v2/methods/crawl.py +0 -469
- build/lib/firecrawl/v2/methods/extract.py +0 -131
- build/lib/firecrawl/v2/methods/map.py +0 -77
- build/lib/firecrawl/v2/methods/scrape.py +0 -64
- build/lib/firecrawl/v2/methods/search.py +0 -197
- build/lib/firecrawl/v2/methods/usage.py +0 -41
- build/lib/firecrawl/v2/types.py +0 -665
- build/lib/firecrawl/v2/utils/__init__.py +0 -9
- build/lib/firecrawl/v2/utils/error_handler.py +0 -107
- build/lib/firecrawl/v2/utils/get_version.py +0 -15
- build/lib/firecrawl/v2/utils/http_client.py +0 -153
- build/lib/firecrawl/v2/utils/http_client_async.py +0 -65
- build/lib/firecrawl/v2/utils/normalize.py +0 -107
- build/lib/firecrawl/v2/utils/validation.py +0 -324
- build/lib/firecrawl/v2/watcher.py +0 -301
- build/lib/firecrawl/v2/watcher_async.py +0 -242
- build/lib/tests/test_change_tracking.py +0 -98
- build/lib/tests/test_timeout_conversion.py +0 -117
- firecrawl_py-3.3.1.dist-info/RECORD +0 -153
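Most of the churn in this release is packaging cleanup: the build/ tree that previously shipped inside the wheel (duplicate sources, the bundled test suites, and firecrawl.backup.py) is removed, and the dist-info RECORD drops from 153 lines to 79. A quick local check of the published wheel, using only the standard library (a minimal sketch; the wheel filename below assumes you have downloaded it to the working directory):

    # Sketch: confirm the 3.3.3 wheel no longer bundles build/ artifacts.
    import zipfile

    wheel_path = "firecrawl_py-3.3.3-py3-none-any.whl"  # assumed local download path
    with zipfile.ZipFile(wheel_path) as whl:
        names = whl.namelist()
        leftovers = [name for name in names if name.startswith("build/")]
        print(f"{len(names)} entries; build/ leftovers: {leftovers or 'none'}")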
--- build/lib/firecrawl/v2/utils/validation.py
+++ /dev/null
@@ -1,324 +0,0 @@
-"""
-Shared validation functions for Firecrawl v2 API.
-"""
-
-from typing import Optional, Dict, Any, List
-from ..types import ScrapeOptions, ScrapeFormats
-
-
-def _convert_format_string(format_str: str) -> str:
-    """
-    Convert format string from snake_case to camelCase.
-
-    Args:
-        format_str: Format string in snake_case
-
-    Returns:
-        Format string in camelCase
-    """
-    format_mapping = {
-        "raw_html": "rawHtml",
-        "change_tracking": "changeTracking",
-        "screenshot_full_page": "screenshot@fullPage"
-    }
-    return format_mapping.get(format_str, format_str)
-
-
-def _normalize_schema(schema: Any) -> Optional[Dict[str, Any]]:
-    """
-    Normalize a schema object which may be a dict, Pydantic BaseModel subclass,
-    or a Pydantic model instance into a plain dict.
-    """
-    try:
-        # Pydantic v2 BaseModel subclass: has "model_json_schema"
-        if hasattr(schema, "model_json_schema") and callable(schema.model_json_schema):
-            return schema.model_json_schema()
-        # Pydantic v2 BaseModel instance: has "model_dump" or "model_json_schema"
-        if hasattr(schema, "model_dump") and callable(schema.model_dump):
-            # Try to get JSON schema if available on the class
-            mjs = getattr(schema.__class__, "model_json_schema", None)
-            if callable(mjs):
-                return schema.__class__.model_json_schema()
-            # Fallback to data shape (not ideal, but better than dropping)
-            return schema.model_dump()
-        # Pydantic v1 BaseModel subclass: has "schema"
-        if hasattr(schema, "schema") and callable(schema.schema):
-            return schema.schema()
-        # Pydantic v1 BaseModel instance
-        if hasattr(schema, "dict") and callable(schema.dict):
-            # Prefer class-level schema if present
-            sch = getattr(schema.__class__, "schema", None)
-            if callable(sch):
-                return schema.__class__.schema()
-            return schema.dict()
-    except Exception:
-        pass
-    # Already a dict or unsupported type
-    return schema if isinstance(schema, dict) else None
-
-
-def _validate_json_format(format_obj: Any) -> Dict[str, Any]:
-    """
-    Validate and prepare json format object.
-
-    Args:
-        format_obj: Format object that should be json type
-
-    Returns:
-        Validated json format dict
-
-    Raises:
-        ValueError: If json format is missing required fields
-    """
-    if not isinstance(format_obj, dict):
-        raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-
-    if format_obj.get('type') != 'json':
-        raise ValueError("json format must have type='json'")
-
-    # prompt is optional in v2; only normalize when present
-    # schema is recommended; if provided, normalize Pydantic forms
-    schema = format_obj.get('schema')
-    normalized = dict(format_obj)
-    if schema is not None:
-        normalized_schema = _normalize_schema(schema)
-        if normalized_schema is not None:
-            normalized['schema'] = normalized_schema
-    return normalized
-
-
-def validate_scrape_options(options: Optional[ScrapeOptions]) -> Optional[ScrapeOptions]:
-    """
-    Validate and normalize scrape options.
-
-    Args:
-        options: Scraping options to validate
-
-    Returns:
-        Validated options or None
-
-    Raises:
-        ValueError: If options are invalid
-    """
-    if options is None:
-        return None
-
-    # Validate timeout
-    if options.timeout is not None and options.timeout <= 0:
-        raise ValueError("Timeout must be positive")
-
-    # Validate wait_for
-    if options.wait_for is not None and options.wait_for < 0:
-        raise ValueError("wait_for must be non-negative")
-
-    return options
-
-
-def prepare_scrape_options(options: Optional[ScrapeOptions]) -> Optional[Dict[str, Any]]:
-    """
-    Prepare ScrapeOptions for API submission with manual snake_case to camelCase conversion.
-
-    Args:
-        options: ScrapeOptions to prepare
-
-    Returns:
-        Dictionary ready for API submission or None if options is None
-    """
-    if options is None:
-        return None
-
-    # Validate options first
-    validated_options = validate_scrape_options(options)
-    if validated_options is None:
-        return None
-
-    # Apply default values for None fields
-    default_values = {
-        "only_main_content": True,
-        "mobile": False,
-        "skip_tls_verification": True,
-        "remove_base64_images": True,
-        "fast_mode": False,
-        "block_ads": True,
-        "max_age": 14400000,
-        "store_in_cache": True
-    }
-
-    # Convert to dict and handle manual snake_case to camelCase conversion
-    options_data = validated_options.model_dump(exclude_none=True)
-
-    # Apply defaults for None fields
-    for field, default_value in default_values.items():
-        if field not in options_data:
-            options_data[field] = default_value
-
-    scrape_data = {}
-
-    # Manual field mapping for snake_case to camelCase conversion
-    field_mappings = {
-        "include_tags": "includeTags",
-        "exclude_tags": "excludeTags",
-        "only_main_content": "onlyMainContent",
-        "wait_for": "waitFor",
-        "skip_tls_verification": "skipTlsVerification",
-        "remove_base64_images": "removeBase64Images",
-        "fast_mode": "fastMode",
-        "use_mock": "useMock",
-        "block_ads": "blockAds",
-        "store_in_cache": "storeInCache",
-        "max_age": "maxAge"
-    }
-
-    # Apply field mappings
-    for snake_case, camel_case in field_mappings.items():
-        if snake_case in options_data:
-            scrape_data[camel_case] = options_data.pop(snake_case)
-
-    # Handle special cases
-    for key, value in options_data.items():
-        if value is not None:
-            if key == "formats":
-                # Handle formats conversion
-                converted_formats: List[Any] = []
-
-                # Prefer using original object to detect ScrapeFormats vs list
-                original_formats = getattr(options, 'formats', None)
-
-                if isinstance(original_formats, ScrapeFormats):
-                    # Include explicit list first
-                    if original_formats.formats:
-                        for fmt in original_formats.formats:
-                            if isinstance(fmt, str):
-                                if fmt == "json":
-                                    raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-                                converted_formats.append(_convert_format_string(fmt))
-                            elif isinstance(fmt, dict):
-                                fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
-                                if fmt_type == 'json':
-                                    validated_json = _validate_json_format({**fmt, 'type': 'json'})
-                                    converted_formats.append(validated_json)
-                                elif fmt_type == 'screenshot':
-                                    # Normalize screenshot options
-                                    normalized = {**fmt, 'type': 'screenshot'}
-                                    if 'full_page' in normalized:
-                                        normalized['fullPage'] = normalized.pop('full_page')
-                                    # Normalize viewport if it's a model instance
-                                    vp = normalized.get('viewport')
-                                    if hasattr(vp, 'model_dump'):
-                                        normalized['viewport'] = vp.model_dump(exclude_none=True)
-                                    converted_formats.append(normalized)
-                                else:
-                                    if 'type' in fmt:
-                                        fmt['type'] = fmt_type or fmt['type']
-                                    converted_formats.append(fmt)
-                            elif hasattr(fmt, 'type'):
-                                if fmt.type == 'json':
-                                    converted_formats.append(_validate_json_format(fmt.model_dump()))
-                                else:
-                                    converted_formats.append(_convert_format_string(fmt.type))
-                            else:
-                                converted_formats.append(fmt)
-
-                    # Add booleans from ScrapeFormats
-                    if original_formats.markdown:
-                        converted_formats.append("markdown")
-                    if original_formats.html:
-                        converted_formats.append("html")
-                    if original_formats.raw_html:
-                        converted_formats.append("rawHtml")
-                    if original_formats.summary:
-                        converted_formats.append("summary")
-                    if original_formats.links:
-                        converted_formats.append("links")
-                    if original_formats.screenshot:
-                        converted_formats.append("screenshot")
-                    if original_formats.change_tracking:
-                        converted_formats.append("changeTracking")
-                    # Note: We intentionally do not auto-include 'json' when boolean is set,
-                    # because JSON requires an object with schema/prompt. The caller must
-                    # supply the full json format object explicitly.
-                elif isinstance(original_formats, list):
-                    for fmt in original_formats:
-                        if isinstance(fmt, str):
-                            if fmt == "json":
-                                raise ValueError("json format must be an object with 'type', 'prompt', and 'schema' fields")
-                            converted_formats.append(_convert_format_string(fmt))
-                        elif isinstance(fmt, dict):
-                            fmt_type = _convert_format_string(fmt.get('type')) if fmt.get('type') else None
-                            if fmt_type == 'json':
-                                validated_json = _validate_json_format({**fmt, 'type': 'json'})
-                                converted_formats.append(validated_json)
-                            elif fmt_type == 'screenshot':
-                                normalized = {**fmt, 'type': 'screenshot'}
-                                if 'full_page' in normalized:
-                                    normalized['fullPage'] = normalized.pop('full_page')
-                                vp = normalized.get('viewport')
-                                if hasattr(vp, 'model_dump'):
-                                    normalized['viewport'] = vp.model_dump(exclude_none=True)
-                                converted_formats.append(normalized)
-                            else:
-                                if 'type' in fmt:
-                                    fmt['type'] = fmt_type or fmt['type']
-                                converted_formats.append(fmt)
-                        elif hasattr(fmt, 'type'):
-                            if fmt.type == 'json':
-                                converted_formats.append(_validate_json_format(fmt.model_dump()))
-                            elif fmt.type == 'screenshot':
-                                normalized = {'type': 'screenshot'}
-                                if getattr(fmt, 'full_page', None) is not None:
-                                    normalized['fullPage'] = fmt.full_page
-                                if getattr(fmt, 'quality', None) is not None:
-                                    normalized['quality'] = fmt.quality
-                                vp = getattr(fmt, 'viewport', None)
-                                if vp is not None:
-                                    normalized['viewport'] = vp.model_dump(exclude_none=True) if hasattr(vp, 'model_dump') else vp
-                                converted_formats.append(normalized)
-                            else:
-                                converted_formats.append(_convert_format_string(fmt.type))
-                        else:
-                            converted_formats.append(fmt)
-                else:
-                    # Fallback: try to iterate over value if it's a list-like
-                    try:
-                        for fmt in value:
-                            converted_formats.append(fmt)
-                    except TypeError:
-                        pass
-
-                if converted_formats:
-                    scrape_data["formats"] = converted_formats
-            elif key == "actions":
-                # Handle actions conversion
-                converted_actions = []
-                for action in value:
-                    if isinstance(action, dict):
-                        # Convert action dict
-                        converted_action = {}
-                        for action_key, action_value in action.items():
-                            if action_key == "full_page":
-                                converted_action["fullPage"] = action_value
-                            else:
-                                converted_action[action_key] = action_value
-                        converted_actions.append(converted_action)
-                    else:
-                        # Handle action objects
-                        action_data = action.model_dump(exclude_none=True)
-                        converted_action = {}
-                        for action_key, action_value in action_data.items():
-                            if action_key == "full_page":
-                                converted_action["fullPage"] = action_value
-                            else:
-                                converted_action[action_key] = action_value
-                        converted_actions.append(converted_action)
-                scrape_data["actions"] = converted_actions
-            elif key == "location":
-                # Handle location conversion
-                if isinstance(value, dict):
-                    scrape_data["location"] = value
-                else:
-                    scrape_data["location"] = value.model_dump(exclude_none=True)
-            else:
-                # For fields that don't need conversion, use as-is
-                scrape_data[key] = value
-
-    return scrape_data
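The removed copy of validation.py above is the helper layer that turns snake_case ScrapeOptions into the camelCase payload the v2 API expects (see the field_mappings and default_values tables inside prepare_scrape_options). A minimal usage sketch, assuming the live module keeps the same import path (firecrawl.v2.utils.validation) and that ScrapeOptions accepts the field names shown in those mappings:

    # Illustrative only: import paths and field names are taken from the removed module above.
    from firecrawl.v2.types import ScrapeOptions
    from firecrawl.v2.utils.validation import prepare_scrape_options

    opts = ScrapeOptions(formats=["markdown", "raw_html"], only_main_content=True, wait_for=1000)
    payload = prepare_scrape_options(opts)
    # Keys come back camelCased with defaults filled in, e.g.:
    # {"formats": ["markdown", "rawHtml"], "onlyMainContent": True, "waitFor": 1000,
    #  "blockAds": True, "maxAge": 14400000, ...}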
--- build/lib/firecrawl/v2/watcher.py
+++ /dev/null
@@ -1,301 +0,0 @@
-"""
-WebSocket-based watcher for v2 jobs (crawl and batch), mirroring v1 behavior.
-
-Usage:
-    watcher = client.watcher(job_id, kind="crawl")
-    watcher.add_listener(lambda status: print(status.status))
-    watcher.start()
-"""
-
-import asyncio
-import json
-import threading
-from typing import Callable, List, Optional, Literal, Union, Dict, Any
-
-import websockets
-
-from .types import CrawlJob, BatchScrapeJob, Document
-from .utils.normalize import normalize_document_input
-
-
-JobKind = Literal["crawl", "batch"]
-JobType = Union[CrawlJob, BatchScrapeJob]
-
-
-class Watcher:
-    def __init__(
-        self,
-        client: object,
-        job_id: str,
-        kind: JobKind = "crawl",
-        poll_interval: int = 2,
-        timeout: Optional[int] = None,
-    ) -> None:
-        self._client = client
-        self._job_id = job_id
-        self._kind = kind
-        self._timeout = timeout
-        self._poll_interval = poll_interval
-        self._listeners: List[Callable[[JobType], None]] = []
-        self._thread: Optional[threading.Thread] = None
-        self._stop = threading.Event()
-
-        http_client = getattr(client, "http_client", None)
-        self._api_url: Optional[str] = getattr(http_client, "api_url", None)
-        self._api_key: Optional[str] = getattr(http_client, "api_key", None)
-
-        # v1-parity state and event handlers
-        self.status: str = "scraping"
-        self.data: List[Dict[str, Any]] = []
-        self._event_handlers: Dict[str, List[Callable[[Dict[str, Any]], None]]] = {
-            "done": [],
-            "error": [],
-            "document": [],
-        }
-        self._sent_done: bool = False
-        self._sent_error: bool = False
-
-    def add_listener(self, callback: Callable[[JobType], None]) -> None:
-        self._listeners.append(callback)
-
-    def _emit(self, status: JobType) -> None:
-        for cb in list(self._listeners):
-            try:
-                cb(status)
-            except Exception:
-                pass
-
-    # v1-like events API
-    def add_event_listener(self, event_type: str, handler: Callable[[Dict[str, Any]], None]) -> None:
-        if event_type in self._event_handlers:
-            self._event_handlers[event_type].append(handler)
-
-    def dispatch_event(self, event_type: str, detail: Dict[str, Any]) -> None:
-        if event_type in self._event_handlers:
-            for handler in self._event_handlers[event_type]:
-                try:
-                    handler(detail)
-                except Exception:
-                    pass
-
-    def _build_ws_url(self) -> str:
-        if not self._api_url:
-            raise ValueError("API URL is required for WebSocket watcher")
-        ws_base = self._api_url.replace("https://", "wss://").replace("http://", "ws://", 1)
-        if self._kind == "crawl":
-            return f"{ws_base}/v2/crawl/{self._job_id}"
-        return f"{ws_base}/v2/batch/scrape/{self._job_id}"
-
-    async def _run_ws(self) -> None:
-        uri = self._build_ws_url()
-        headers_list = []
-        if self._api_key:
-            headers_list.append(("Authorization", f"Bearer {self._api_key}"))
-
-        try:
-            async with websockets.connect(uri, max_size=None, additional_headers=headers_list) as websocket:
-                deadline = asyncio.get_event_loop().time() + self._timeout if self._timeout else None
-                while not self._stop.is_set():
-                    # Use short recv timeouts to allow HTTP polling fallback
-                    if deadline is not None:
-                        remaining = max(0.0, deadline - asyncio.get_event_loop().time())
-                        timeout = min(self._poll_interval or remaining, remaining)
-                    else:
-                        timeout = self._poll_interval or 5
-                    try:
-                        msg = await asyncio.wait_for(websocket.recv(), timeout=timeout)
-                    except asyncio.TimeoutError:
-                        # Quiet period: poll HTTP once to progress statuses
-                        if await self._poll_status_once():
-                            break
-                        else:
-                            continue
-                    except asyncio.CancelledError:
-                        break
-                    except Exception:
-                        # Connection error: switch to HTTP polling until terminal or timeout
-                        while not self._stop.is_set():
-                            if await self._poll_status_once():
-                                return
-                            if deadline is not None and asyncio.get_event_loop().time() >= deadline:
-                                return
-                            await asyncio.sleep(self._poll_interval or 2)
-                        return
-
-                    try:
-                        body = json.loads(msg)
-                    except Exception:
-                        continue
-
-                    # v1-style typed event handling
-                    msg_type = body.get("type")
-                    if msg_type == "error":
-                        self.status = "failed"
-                        self.dispatch_event("error", {
-                            "status": self.status,
-                            "data": self.data,
-                            "error": body.get("error"),
-                            "id": self._job_id,
-                        })
-                        self._sent_error = True
-                        # Emit a final failed snapshot for listeners
-                        if self._kind == "crawl":
-                            job = CrawlJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
-                        else:
-                            job = BatchScrapeJob(status="failed", completed=0, total=0, credits_used=0, expires_at=None, next=None, data=[])
-                        self._emit(job)
-                        break
-                    elif msg_type == "catchup":
-                        d = body.get("data", {})
-                        self.status = d.get("status", self.status)
-                        docs_in = d.get("data", [])
-                        self.data.extend(docs_in)
-                        for doc in docs_in:
-                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
-                    elif msg_type == "document":
-                        doc = body.get("data")
-                        if isinstance(doc, dict):
-                            self.data.append(doc)
-                            self.dispatch_event("document", {"data": doc, "id": self._job_id})
-                    elif msg_type == "done":
-                        self.status = "completed"
-                        # Gather any documents in the done payload
-                        raw_payload = body.get("data", {}) or {}
-                        docs_in = raw_payload.get("data", []) or []
-                        if isinstance(docs_in, list) and docs_in:
-                            for doc in docs_in:
-                                if isinstance(doc, dict):
-                                    self.data.append(doc)
-                        # Dispatch done event first
-                        self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
-                        self._sent_done = True
-                        # Emit a final completed snapshot for listeners and break immediately
-                        docs: List[Document] = []
-                        for doc in self.data:
-                            if isinstance(doc, dict):
-                                d = normalize_document_input(doc)
-                                docs.append(Document(**d))
-                        if self._kind == "crawl":
-                            job = CrawlJob(
-                                status="completed",
-                                completed=raw_payload.get("completed", 0),
-                                total=raw_payload.get("total", 0),
-                                credits_used=raw_payload.get("creditsUsed", 0),
-                                expires_at=raw_payload.get("expiresAt"),
-                                next=raw_payload.get("next"),
-                                data=docs,
-                            )
-                        else:
-                            job = BatchScrapeJob(
-                                status="completed",
-                                completed=raw_payload.get("completed", 0),
-                                total=raw_payload.get("total", 0),
-                                credits_used=raw_payload.get("creditsUsed", 0),
-                                expires_at=raw_payload.get("expiresAt"),
-                                next=raw_payload.get("next"),
-                                data=docs,
-                            )
-                        self._emit(job)
-                        break
-
-                    payload = body.get("data", body)
-                    # Only treat messages with an explicit status as job snapshots
-                    has_status_field = (isinstance(payload, dict) and "status" in payload) or ("status" in body)
-                    if not has_status_field:
-                        continue
-                    status_str = payload.get("status", body.get("status", self.status))
-
-                    if self._kind == "crawl":
-                        docs = []
-                        for doc in payload.get("data", []):
-                            if isinstance(doc, dict):
-                                d = normalize_document_input(doc)
-                                docs.append(Document(**d))
-                        job = CrawlJob(
-                            status=status_str,
-                            completed=payload.get("completed", 0),
-                            total=payload.get("total", 0),
-                            credits_used=payload.get("creditsUsed", 0),
-                            expires_at=payload.get("expiresAt"),
-                            next=payload.get("next"),
-                            data=docs,
-                        )
-                        self._emit(job)
-                        if status_str in ("completed", "failed", "cancelled"):
-                            # Ensure done/error dispatched even if server didn't send explicit event type
-                            if status_str == "completed" and not self._sent_done:
-                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
-                                self._sent_done = True
-                            if status_str == "failed" and not self._sent_error:
-                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
-                                self._sent_error = True
-                            break
-                    else:
-                        docs = []
-                        for doc in payload.get("data", []):
-                            if isinstance(doc, dict):
-                                d = normalize_document_input(doc)
-                                docs.append(Document(**d))
-                        job = BatchScrapeJob(
-                            status=status_str,
-                            completed=payload.get("completed", 0),
-                            total=payload.get("total", 0),
-                            credits_used=payload.get("creditsUsed"),
-                            expires_at=payload.get("expiresAt"),
-                            next=payload.get("next"),
-                            data=docs,
-                        )
-                        self._emit(job)
-                        if status_str in ("completed", "failed", "cancelled"):
-                            if status_str == "completed" and not self._sent_done:
-                                self.dispatch_event("done", {"status": status_str, "data": self.data, "id": self._job_id})
-                                self._sent_done = True
-                            if status_str == "failed" and not self._sent_error:
-                                self.dispatch_event("error", {"status": status_str, "data": self.data, "id": self._job_id})
-                                self._sent_error = True
-                            break
-        except Exception:
-            pass
-        finally:
-            # Ensure terminal event parity with v1 even on abrupt disconnects
-            if self.status == "completed" and not self._sent_done:
-                self.dispatch_event("done", {"status": self.status, "data": self.data, "id": self._job_id})
-                self._sent_done = True
-
-    async def _poll_status_once(self) -> bool:
-        """Poll job status over HTTP once. Returns True if terminal."""
-        try:
-            if self._kind == "crawl":
-                job: CrawlJob = await asyncio.to_thread(self._client.get_crawl_status, self._job_id)
-            else:
-                job: BatchScrapeJob = await asyncio.to_thread(self._client.get_batch_scrape_status, self._job_id)
-        except Exception:
-            return False
-
-        self.status = job.status
-        self._emit(job)
-        if job.status in ("completed", "failed", "cancelled"):
-            if job.status == "completed" and not self._sent_done:
-                self.dispatch_event("done", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
-                self._sent_done = True
-            if job.status == "failed" and not self._sent_error:
-                self.dispatch_event("error", {"status": job.status, "data": [d.model_dump() for d in job.data], "id": self._job_id})
-                self._sent_error = True
-            return True
-        return False
-
-    def _loop(self) -> None:
-        asyncio.run(self._run_ws())
-
-    def start(self) -> None:
-        if self._thread and self._thread.is_alive():
-            return
-        self._stop.clear()
-        self._thread = threading.Thread(target=self._loop, daemon=True)
-        self._thread.start()
-
-    def stop(self) -> None:
-        self._stop.set()
-        if self._thread:
-            self._thread.join(timeout=1)
-
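The removed watcher.py above mirrors the v1 event API: typed "document" / "done" / "error" events via dispatch_event, whole-job snapshots via add_listener, and an HTTP polling fallback whenever the WebSocket goes quiet or drops. A minimal usage sketch following the module docstring; client and job_id are assumed to come from an existing v2 client and a crawl it has already started:

    # Sketch only: how the removed Watcher was driven; client/job_id are assumed context.
    watcher = client.watcher(job_id, kind="crawl")  # entry point shown in the module docstring
    watcher.add_event_listener("document", lambda ev: print("document received for job", ev["id"]))
    watcher.add_event_listener("done", lambda ev: print("finished with", len(ev["data"]), "documents"))
    watcher.add_listener(lambda job: print(job.status, f"{job.completed}/{job.total}"))
    watcher.start()  # runs the WebSocket loop in a daemon thread
    # ... once a terminal status ("completed", "failed", "cancelled") has been observed:
    watcher.stop()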