scrape-do-python 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrape_do/__init__.py +0 -0
- scrape_do/abc.py +0 -0
- scrape_do/async_client.py +0 -0
- scrape_do/client.py +804 -0
- scrape_do/constants.py +84 -0
- scrape_do/exceptions.py +238 -0
- scrape_do/models/__init__.py +79 -0
- scrape_do/models/browser_actions.py +332 -0
- scrape_do/models/enums.py +76 -0
- scrape_do/models/parameters.py +840 -0
- scrape_do/models/request.py +232 -0
- scrape_do/models/response.py +890 -0
- scrape_do/namespaces/__init__.py +0 -0
- scrape_do/namespaces/amazon.py +0 -0
- scrape_do/namespaces/google.py +0 -0
- scrape_do/namespaces/jobs.py +0 -0
- scrape_do_python-0.1.0.dist-info/METADATA +134 -0
- scrape_do_python-0.1.0.dist-info/RECORD +21 -0
- scrape_do_python-0.1.0.dist-info/WHEEL +5 -0
- scrape_do_python-0.1.0.dist-info/licenses/LICENSE +21 -0
- scrape_do_python-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,840 @@
|
|
|
1
|
+
"""Core validation engine and configuration contracts.
|
|
2
|
+
|
|
3
|
+
Validates request data before the network layer to ensure that invalid
|
|
4
|
+
configurations are caught locally without wasting network requests by using
|
|
5
|
+
Pydantic V2 models to enforce Scrape.do's parameter dependencies and
|
|
6
|
+
interactions
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
import json
|
|
11
|
+
import urllib.parse
|
|
12
|
+
from typing import (
|
|
13
|
+
Optional,
|
|
14
|
+
List,
|
|
15
|
+
Self,
|
|
16
|
+
Type,
|
|
17
|
+
Any,
|
|
18
|
+
Dict,
|
|
19
|
+
TypedDict
|
|
20
|
+
)
|
|
21
|
+
from pydantic import (
|
|
22
|
+
BaseModel,
|
|
23
|
+
Field,
|
|
24
|
+
HttpUrl,
|
|
25
|
+
model_validator,
|
|
26
|
+
field_validator,
|
|
27
|
+
ValidationInfo,
|
|
28
|
+
ConfigDict
|
|
29
|
+
)
|
|
30
|
+
from .browser_actions import BrowserAction
|
|
31
|
+
from .enums import (
|
|
32
|
+
OutputType,
|
|
33
|
+
DeviceType,
|
|
34
|
+
WaitUntilType,
|
|
35
|
+
RegionCodeType
|
|
36
|
+
)
|
|
37
|
+
from ..constants import (
|
|
38
|
+
_SUPER_SUPPORTED_COUNTRIES,
|
|
39
|
+
_DATACENTER_SUPPORTED_COUNTRIES,
|
|
40
|
+
_ZIPCODE_FORMATS
|
|
41
|
+
)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# ----------------------------------
|
|
45
|
+
# RequestParameters Kwargs TypedDict
|
|
46
|
+
# ----------------------------------
|
|
47
|
+
|
|
48
|
+
class RequestParametersDict(TypedDict, total=False):
    """
    Provides strict IDE autocomplete and static type checking for `**kwargs`
    dictionaries meant for the
    [RequestParameters][scrape_do.models.RequestParameters] model.

    Every key is optional (`total=False`) and mirrors a field of
    `RequestParameters`; no cross-field validation happens here — that is
    performed by the `RequestParameters` model itself.
    """
    super: Optional[bool]
    """
    Activates Residential/Mobile IP proxies.
    """
    render: Optional[bool]
    """
    Executes the request using a headless browser.
    """
    device: Optional[DeviceType]
    """
    Specify the device type (desktop, mobile, tablet)
    """
    session_id: Optional[int]
    """
    Use the same IP address continuously with a session
    """
    geo_code: Optional[str]
    """
    ISO 3166-1 alpha-2 country code for IP targeting.
    """
    regional_geo_code: Optional[RegionCodeType]
    """
    Targets a broader geographical region. Requires super=True.
    """
    postal_code: Optional[str]
    """
    Targets a specific zip code. Requires super=True and a supported geo_code.
    """
    wait_until: Optional[WaitUntilType]
    """
    Control when the browser considers the page loaded
    """
    custom_wait: Optional[int]
    """
    Set the browser wait time on the target web page after content loaded
    """
    wait_selector: Optional[str]
    """
    CSS selector to wait for in the target web page.
    """
    width: Optional[int]
    """
    Custom viewport width.
    """
    height: Optional[int]
    """
    Custom viewport height.
    """
    return_json: Optional[bool]
    """
    Returns response body as base64-encoded JSON instead of raw HTML.
    """
    block_resources: Optional[bool]
    """
    Block CSS, images, and fonts on your target web page
    """
    screenshot: Optional[bool]
    """
    Captures the visible viewport.
    """
    full_screenshot: Optional[bool]
    """
    Captures the entire scrollable page.
    """
    particular_screenshot: Optional[str]
    """
    Captures a specific DOM element by selector.
    """
    play_with_browser: Optional[List[BrowserAction]]
    """
    A sequence of automated interactions to perform.
    """
    show_frames: Optional[bool]
    """
    Returns all iframe content from the target webpage. Requires render=true
    and returnJSON=true
    """
    show_websocket_requests: Optional[bool]
    """
    Captures WebSocket network traffic. Requires render=true and
    returnJSON=true.
    """
    custom_headers: Optional[bool]
    """
    Replaces Scrape.do's default headers with your provided headers.
    """
    extra_headers: Optional[bool]
    """
    Appends your provided headers to Scrape.do's default headers.
    """
    forward_headers: Optional[bool]
    """
    Forwards all headers exactly as sent by your client.
    """
    set_cookies: Optional[str]
    """
    Injects specific cookies into the request.
    """
    disable_redirection: Optional[bool]
    """
    Prevents the proxy from following 3xx HTTP redirects.
    """
    timeout: Optional[int]
    """
    Total API connection timeout in milliseconds.
    """
    retry_timeout: Optional[int]
    """
    Internal proxy retry duration in milliseconds. Cannot be used with
    render=True.
    """
    disable_retry: Optional[bool]
    """
    Fails immediately on target error without rotating IPs.
    """
    output: Optional[OutputType]
    """
    Output format parser.
    """
    transparent_response: Optional[bool]
    """
    Return pure response from target web page without Scrape.do processing
    """
    pure_cookies: Optional[bool]
    """
    Returns the original Set-Cookie headers from the target website
    """
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
# --------------------
|
|
184
|
+
# Request Parameters
|
|
185
|
+
# --------------------
|
|
186
|
+
|
|
187
|
+
class RequestParameters(BaseModel):
    """The strict data contract for the request parameters accepted by
    Scrape.do's API.

    Every parameter dependency, mutually exclusive rule, and geographical
    targeting constraint declared by the API is enforced locally, before a
    network request is ever generated.

    Attributes:
        url (HttpUrl): The absolute destination URL you wish to scrape.
        super (Optional[bool]): Activates Residential/Mobile IP proxies.
        render (Optional[bool]): Executes the request using a headless
            browser.
        device (Optional[DeviceType]): Specify the device type (desktop,
            mobile, tablet)
        session_id (Optional[int]): Use the same IP address continuously
            with a session
        geo_code (Optional[str]): ISO 3166-1 alpha-2 country code for IP
            targeting.
        regional_geo_code (Optional[RegionCodeType]): Targets a broader
            geographical region. Requires super=True.
        postal_code (Optional[str]): Targets a specific zip code. Requires
            super=True and a supported geo_code.
        wait_until (Optional[WaitUntilType]): Control when the browser
            considers the page loaded
        custom_wait (Optional[int]): Set the browser wait time on the target
            web page after content loaded
        wait_selector (Optional[str]): CSS selector to wait for in the
            target web page.
        width (Optional[int]): Custom viewport width.
        height (Optional[int]): Custom viewport height.
        return_json (Optional[bool]): Returns response body as
            base64-encoded JSON instead of raw HTML.
        block_resources (Optional[bool]): Block CSS, images, and fonts on
            your target web page
        screenshot (Optional[bool]): Captures the visible viewport.
        full_screenshot (Optional[bool]): Captures the entire scrollable
            page.
        particular_screenshot (Optional[str]): Captures a specific DOM
            element by selector.
        play_with_browser (Optional[List[BrowserAction]]): A sequence of
            automated interactions to perform.
        show_frames (Optional[bool]): Returns all iframe content from the
            target webpage. Requires render=true and returnJSON=true
        show_websocket_requests (Optional[bool]): Captures WebSocket network
            traffic. Requires render=true and returnJSON=true.
        custom_headers (Optional[bool]): Replaces Scrape.do's default
            headers with your provided headers.
        extra_headers (Optional[bool]): Appends your provided headers to
            Scrape.do's default headers.
        forward_headers (Optional[bool]): Forwards all headers exactly as
            sent by your client.
        set_cookies (Optional[str]): Injects specific cookies into the
            request.
        disable_redirection (Optional[bool]): Prevents the proxy from
            following 3xx HTTP redirects.
        timeout (Optional[int]): Total API connection timeout in
            milliseconds.
        retry_timeout (Optional[int]): Internal proxy retry duration in
            milliseconds. Cannot be used with render=True.
        disable_retry (Optional[bool]): Fails immediately on target error
            without rotating IPs.
        output (Optional[OutputType]): Output format parser.
        transparent_response (Optional[bool]): Return pure response from
            target web page without Scrape.do processing
        pure_cookies (Optional[bool]): Returns the original Set-Cookie
            headers from the target website
    """

    # Allow population by pythonic field name as well as the camelCase alias.
    model_config = ConfigDict(populate_by_name=True)

    # --- Required Parameters ---

    url: HttpUrl = Field(..., alias="url")

    # --- Core Routing Parameters ---

    super: Optional[bool] = Field(default=None, alias="super")
    render: Optional[bool] = Field(default=None, alias="render")
    device: Optional[DeviceType] = Field(default=None, alias="device")
    session_id: Optional[int] = Field(
        default=None, alias="sessionId", ge=0, le=1000000
    )

    # --- Location Parameters ---
    # validate_default=True makes the custom field validators run even when
    # the field is left at its default.

    geo_code: Optional[str] = Field(
        default=None, alias="geoCode",
        min_length=2, max_length=2, validate_default=True
    )
    regional_geo_code: Optional[RegionCodeType] = Field(
        default=None, alias="regionalGeoCode"
    )
    postal_code: Optional[str] = Field(
        default=None, alias="postalcode", validate_default=True
    )

    # --- Browser Parameters ---

    wait_until: Optional[WaitUntilType] = Field(default=None, alias="waitUntil")
    custom_wait: Optional[int] = Field(
        default=None, alias="customWait", ge=0, le=35000
    )
    wait_selector: Optional[str] = Field(default=None, alias="waitSelector")
    width: Optional[int] = Field(default=None, alias="width")
    height: Optional[int] = Field(default=None, alias="height")
    return_json: Optional[bool] = Field(default=None, alias="returnJSON")
    block_resources: Optional[bool] = Field(default=None, alias="blockResources")
    screenshot: Optional[bool] = Field(default=None, alias="screenShot")
    full_screenshot: Optional[bool] = Field(default=None, alias="fullScreenShot")
    particular_screenshot: Optional[str] = Field(
        default=None, alias="particularScreenShot"
    )
    play_with_browser: Optional[List[BrowserAction]] = Field(
        default=None, alias="playWithBrowser"
    )

    # --- Browser Response Configuration Parameters ---

    show_frames: Optional[bool] = Field(default=None, alias="showFrames")
    show_websocket_requests: Optional[bool] = Field(
        default=None, alias="showWebsocketRequests"
    )

    # --- Header + Cookie Control Parameters ---

    custom_headers: Optional[bool] = Field(default=None, alias="customHeaders")
    extra_headers: Optional[bool] = Field(default=None, alias="extraHeaders")
    forward_headers: Optional[bool] = Field(default=None, alias="forwardHeaders")
    set_cookies: Optional[str] = Field(default=None, alias="setCookies")

    # --- Network Parameters ---

    disable_redirection: Optional[bool] = Field(
        default=None, alias="disableRedirection"
    )
    timeout: Optional[int] = Field(
        default=None, alias="timeout", ge=5000, le=120000
    )
    retry_timeout: Optional[int] = Field(
        default=None, alias="retryTimeout", ge=5000, le=55000
    )
    disable_retry: Optional[bool] = Field(default=None, alias="disableRetry")

    # --- General Response Configuration Parameters ---

    output: Optional[OutputType] = Field(default=None, alias="output")
    transparent_response: Optional[bool] = Field(
        default=None, alias="transparentResponse"
    )
    pure_cookies: Optional[bool] = Field(default=None, alias="pureCookies")
|
|
442
|
+
|
|
443
|
+
@model_validator(mode="after")
|
|
444
|
+
def validate_compatibility(self) -> Self:
|
|
445
|
+
"""Cross-validates parameter dependencies to prevent invalid API
|
|
446
|
+
requests locally.
|
|
447
|
+
|
|
448
|
+
info: Headless Browser Dependencies (`render=True`)
|
|
449
|
+
- `wait_until`
|
|
450
|
+
- `wait_selector`
|
|
451
|
+
- `custom_wait`
|
|
452
|
+
- `width`
|
|
453
|
+
- `height`
|
|
454
|
+
- `return_json`
|
|
455
|
+
- `block_resources`
|
|
456
|
+
- `screenshot`
|
|
457
|
+
- `full_screenshot`
|
|
458
|
+
- `particular_screenshot`
|
|
459
|
+
- `play_with_browser`
|
|
460
|
+
- `show_frames`
|
|
461
|
+
- `show_websocket_requests`
|
|
462
|
+
|
|
463
|
+
info: ReturnJSON Dependencies (`render=True` + `return_json=True`)
|
|
464
|
+
- `screenshot`
|
|
465
|
+
- `full_screenshot`
|
|
466
|
+
- `particular_screenshot`
|
|
467
|
+
- `show_frames`
|
|
468
|
+
- `show_websocket_requests`
|
|
469
|
+
|
|
470
|
+
info: Super Proxy Dependencies (`super=True`)
|
|
471
|
+
- `regional_geo_code`
|
|
472
|
+
|
|
473
|
+
info: Screenshot Parameters
|
|
474
|
+
- Only one of the screenshot parameters can be set at a time.
|
|
475
|
+
|
|
476
|
+
- In addition to `render=True` and `return_json=True`, all
|
|
477
|
+
screenshot parameters require `blockResources` to be set to
|
|
478
|
+
False.
|
|
479
|
+
|
|
480
|
+
info: Header Parameters
|
|
481
|
+
- Only one of the header parameters can be set at a time.
|
|
482
|
+
|
|
483
|
+
- None of the header parameters can be set to True when using the
|
|
484
|
+
`setCookies` parameter
|
|
485
|
+
|
|
486
|
+
info: Mutually Exclusive Parameters
|
|
487
|
+
- The `playWithBrowser` and `particular_screenshot` parameters
|
|
488
|
+
cannot be used simultaneously
|
|
489
|
+
|
|
490
|
+
- The `retryTimeout` and `render` parameters cannot be used
|
|
491
|
+
simultaneously
|
|
492
|
+
|
|
493
|
+
- The `regional_geo_code` and `geo_code` parameters cannot be used
|
|
494
|
+
simultaneously
|
|
495
|
+
|
|
496
|
+
Returns:
|
|
497
|
+
The validated instance from which the method was called
|
|
498
|
+
|
|
499
|
+
Raises:
|
|
500
|
+
ValueError: If mutually exclusive parameters are combined or if
|
|
501
|
+
dependent parameters are provided without their required
|
|
502
|
+
prerequisites.
|
|
503
|
+
"""
|
|
504
|
+
|
|
505
|
+
# --- Headless Browser Dependencies ---
|
|
506
|
+
|
|
507
|
+
# Render Dependencies
|
|
508
|
+
|
|
509
|
+
render_dependent_fields = {
|
|
510
|
+
"wait_until": self.wait_until,
|
|
511
|
+
"custom_wait": self.custom_wait,
|
|
512
|
+
"wait_selector": self.wait_selector,
|
|
513
|
+
"width": self.width,
|
|
514
|
+
"height": self.height,
|
|
515
|
+
"return_json": self.return_json,
|
|
516
|
+
"block_resources": self.block_resources,
|
|
517
|
+
"screenshot": self.screenshot,
|
|
518
|
+
"full_screenshot": self.full_screenshot,
|
|
519
|
+
"particular_screenshot": self.particular_screenshot,
|
|
520
|
+
"play_with_browser": self.play_with_browser,
|
|
521
|
+
"show_frames": self.show_frames,
|
|
522
|
+
"show_websocket_requests": self.show_websocket_requests
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
used_render_fields = [
|
|
526
|
+
field_name for field_name, value in render_dependent_fields.items()
|
|
527
|
+
if value is not None
|
|
528
|
+
]
|
|
529
|
+
|
|
530
|
+
if used_render_fields and not self.render:
|
|
531
|
+
raise ValueError(
|
|
532
|
+
f"The following parameters require 'render=true' to be set: "
|
|
533
|
+
f"{', '.join(used_render_fields)}."
|
|
534
|
+
)
|
|
535
|
+
|
|
536
|
+
# ReturnJSON Additional Dependencies
|
|
537
|
+
json_dependent_fields = {
|
|
538
|
+
"screenshot": self.screenshot,
|
|
539
|
+
"full_screenshot": self.full_screenshot,
|
|
540
|
+
"particular_screenshot": self.particular_screenshot,
|
|
541
|
+
"show_frames": self.show_frames,
|
|
542
|
+
"show_websocket_requests": self.show_websocket_requests
|
|
543
|
+
}
|
|
544
|
+
|
|
545
|
+
used_json_fields = [
|
|
546
|
+
field_name for field_name, value in json_dependent_fields.items()
|
|
547
|
+
if value
|
|
548
|
+
]
|
|
549
|
+
|
|
550
|
+
if used_json_fields and not self.return_json:
|
|
551
|
+
raise ValueError((
|
|
552
|
+
f"The following parameters require both 'render=true' AND"
|
|
553
|
+
f" 'returnJSON=true' to be set: "
|
|
554
|
+
f" {', '.join(used_json_fields)}."
|
|
555
|
+
))
|
|
556
|
+
|
|
557
|
+
# Screenshot Additional Dependencies
|
|
558
|
+
screenshot_fields = {
|
|
559
|
+
"screenshot": self.screenshot,
|
|
560
|
+
"full_screenshot": self.full_screenshot,
|
|
561
|
+
"particular_screenshot": self.particular_screenshot
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
used_screenshot_fields = [
|
|
565
|
+
field_name for field_name, value in screenshot_fields.items()
|
|
566
|
+
if value
|
|
567
|
+
]
|
|
568
|
+
|
|
569
|
+
if used_screenshot_fields and self.block_resources:
|
|
570
|
+
raise ValueError((
|
|
571
|
+
f"Screenshot parameters automatically operate with "
|
|
572
|
+
f"'blockResources=false' to ensure contents are loaded "
|
|
573
|
+
f"correctly. Screenshot Parameters used:"
|
|
574
|
+
f" {', '.join(used_screenshot_fields)}"
|
|
575
|
+
))
|
|
576
|
+
|
|
577
|
+
# --- Enforce Mutually Eclusive Parameters ---
|
|
578
|
+
|
|
579
|
+
if self.render and self.retry_timeout is not None:
|
|
580
|
+
raise ValueError(
|
|
581
|
+
"The 'retry_timeout' parameter cannot be used concurrently"
|
|
582
|
+
" with 'render=true'"
|
|
583
|
+
)
|
|
584
|
+
|
|
585
|
+
if len(used_screenshot_fields) > 1:
|
|
586
|
+
raise ValueError(
|
|
587
|
+
f"Only one screenshot parameter can be used at a time."
|
|
588
|
+
f" Screenshot Parameters used:"
|
|
589
|
+
f" {', '.join(used_screenshot_fields)}"
|
|
590
|
+
)
|
|
591
|
+
|
|
592
|
+
if (
|
|
593
|
+
self.particular_screenshot is not None
|
|
594
|
+
and self.play_with_browser is not None
|
|
595
|
+
):
|
|
596
|
+
raise ValueError(
|
|
597
|
+
"The 'particular_screenshot' parameter cannot be used"
|
|
598
|
+
" concurrently with the 'playWithBrowser' parameter"
|
|
599
|
+
)
|
|
600
|
+
|
|
601
|
+
header_fields = {
|
|
602
|
+
"custom_headers": self.custom_headers,
|
|
603
|
+
"extra_headers": self.extra_headers,
|
|
604
|
+
"forward_headers": self.forward_headers
|
|
605
|
+
}
|
|
606
|
+
|
|
607
|
+
used_header_fields = [
|
|
608
|
+
field_name for field_name, value in header_fields.items()
|
|
609
|
+
if value
|
|
610
|
+
]
|
|
611
|
+
|
|
612
|
+
if len(used_header_fields) > 1:
|
|
613
|
+
raise ValueError(
|
|
614
|
+
f"Only one header parameter can be used at a time."
|
|
615
|
+
f" Header Parameters used: {', '.join(used_header_fields)}"
|
|
616
|
+
)
|
|
617
|
+
|
|
618
|
+
if used_header_fields and self.set_cookies:
|
|
619
|
+
raise ValueError(
|
|
620
|
+
f"Header parameters cannot be used concurrently with"
|
|
621
|
+
f" the set_cookies parameter. Header Parameters used:"
|
|
622
|
+
f" {', '.join(used_header_fields)}"
|
|
623
|
+
)
|
|
624
|
+
|
|
625
|
+
if self.geo_code is not None and self.regional_geo_code is not None:
|
|
626
|
+
raise ValueError(
|
|
627
|
+
"'geoCode' and 'regionalGeoCode' parameters cannot be used"
|
|
628
|
+
" simultaneously"
|
|
629
|
+
)
|
|
630
|
+
|
|
631
|
+
if not self.super and self.regional_geo_code is not None:
|
|
632
|
+
raise ValueError(
|
|
633
|
+
"'super=true' must be set to use the 'regionalGeoCode'"
|
|
634
|
+
" parameter"
|
|
635
|
+
)
|
|
636
|
+
|
|
637
|
+
return self
|
|
638
|
+
|
|
639
|
+
@field_validator("geo_code")
|
|
640
|
+
@classmethod
|
|
641
|
+
def validate_geo_code(
|
|
642
|
+
cls: Type[Self],
|
|
643
|
+
v: Optional[str],
|
|
644
|
+
info: ValidationInfo
|
|
645
|
+
) -> Optional[str]:
|
|
646
|
+
"""Validates the country code against the allowed proxy pools.
|
|
647
|
+
|
|
648
|
+
Args:
|
|
649
|
+
v (Optional[str]): The `geo_code` provided during initialization
|
|
650
|
+
info (ValidationInfo): The data already validated for the model so
|
|
651
|
+
far
|
|
652
|
+
|
|
653
|
+
Returns:
|
|
654
|
+
The validated `geo_code` parameter
|
|
655
|
+
|
|
656
|
+
Raises:
|
|
657
|
+
ValueError: If the country code is not supported by the selected
|
|
658
|
+
proxy tier.
|
|
659
|
+
"""
|
|
660
|
+
|
|
661
|
+
is_super = info.data.get("super", False)
|
|
662
|
+
if v is not None:
|
|
663
|
+
v = v.lower()
|
|
664
|
+
if is_super:
|
|
665
|
+
if v not in _SUPER_SUPPORTED_COUNTRIES:
|
|
666
|
+
raise ValueError(
|
|
667
|
+
f"'{v}' is not a supported country code"
|
|
668
|
+
)
|
|
669
|
+
else:
|
|
670
|
+
if v not in _DATACENTER_SUPPORTED_COUNTRIES:
|
|
671
|
+
if v in _SUPER_SUPPORTED_COUNTRIES:
|
|
672
|
+
raise ValueError(
|
|
673
|
+
f"'{v}' is not a supported country code when"
|
|
674
|
+
f" 'super=false'"
|
|
675
|
+
)
|
|
676
|
+
else:
|
|
677
|
+
raise ValueError(
|
|
678
|
+
f"'{v}' is not a supported country code"
|
|
679
|
+
)
|
|
680
|
+
return v
|
|
681
|
+
|
|
682
|
+
return v
|
|
683
|
+
|
|
684
|
+
@field_validator("postal_code")
|
|
685
|
+
@classmethod
|
|
686
|
+
def validate_postal_code(
|
|
687
|
+
cls: Type[Self],
|
|
688
|
+
v: Optional[str],
|
|
689
|
+
info: ValidationInfo
|
|
690
|
+
) -> Optional[str]:
|
|
691
|
+
"""Validates postal codes based on specific regional formats.
|
|
692
|
+
|
|
693
|
+
Args:
|
|
694
|
+
v (Optional[str]): The `postal_code` provided during initialization
|
|
695
|
+
info (ValidationInfo): The data already validated for the model so
|
|
696
|
+
far
|
|
697
|
+
|
|
698
|
+
Returns:
|
|
699
|
+
The validated `postal_code` parameter
|
|
700
|
+
|
|
701
|
+
Raises:
|
|
702
|
+
ValueError: If dependencies are missing or the format does not
|
|
703
|
+
match the regional regex.
|
|
704
|
+
"""
|
|
705
|
+
if v is not None:
|
|
706
|
+
v = v.strip()
|
|
707
|
+
is_super = info.data.get("super", False)
|
|
708
|
+
geo_code = info.data.get("geo_code")
|
|
709
|
+
|
|
710
|
+
if not is_super or not geo_code:
|
|
711
|
+
raise ValueError(
|
|
712
|
+
"The 'postalcode' parameter can only be used when both "
|
|
713
|
+
"'super=true' and a valid 'geoCode' are provided."
|
|
714
|
+
)
|
|
715
|
+
|
|
716
|
+
if geo_code not in _ZIPCODE_FORMATS:
|
|
717
|
+
raise ValueError(
|
|
718
|
+
f"Zip code targeting is not supported for country"
|
|
719
|
+
f" '{geo_code}'. "
|
|
720
|
+
f" Supported countries are:"
|
|
721
|
+
f" {', '.join(_ZIPCODE_FORMATS.keys())}."
|
|
722
|
+
)
|
|
723
|
+
|
|
724
|
+
regex = _ZIPCODE_FORMATS[geo_code]
|
|
725
|
+
if not regex.match(v):
|
|
726
|
+
raise ValueError(
|
|
727
|
+
f"Invalid zip code format for {geo_code}. "
|
|
728
|
+
f"Provided '{v}' does not match the required pattern."
|
|
729
|
+
)
|
|
730
|
+
|
|
731
|
+
return v
|
|
732
|
+
return v
|
|
733
|
+
|
|
734
|
+
def to_api_params(self) -> Dict[str, Any]:
|
|
735
|
+
"""Serializes the model into a dictionary formatted for httpx
|
|
736
|
+
query parameters.
|
|
737
|
+
|
|
738
|
+
This method automatically drops unassigned fields, maps snake_case
|
|
739
|
+
variables to their camelCase API equivalents, and stringifies nested
|
|
740
|
+
JSON objects as required by Scrape.do.
|
|
741
|
+
|
|
742
|
+
Returns:
|
|
743
|
+
A sanitized dictionary ready to be passed to httpx.
|
|
744
|
+
"""
|
|
745
|
+
|
|
746
|
+
params = self.model_dump(
|
|
747
|
+
by_alias=True,
|
|
748
|
+
exclude_none=True,
|
|
749
|
+
mode="json"
|
|
750
|
+
)
|
|
751
|
+
|
|
752
|
+
for key, value in params.items():
|
|
753
|
+
|
|
754
|
+
# Serialize playWithBrowserActions
|
|
755
|
+
if key == "playWithBrowser" and self.play_with_browser:
|
|
756
|
+
actions = []
|
|
757
|
+
for action in self.play_with_browser:
|
|
758
|
+
a_dict = action.model_dump(
|
|
759
|
+
by_alias=True,
|
|
760
|
+
exclude_none=True,
|
|
761
|
+
mode="json"
|
|
762
|
+
)
|
|
763
|
+
|
|
764
|
+
# Scrape.do's backend expects string booleans
|
|
765
|
+
for k, v in a_dict.items():
|
|
766
|
+
if isinstance(v, bool):
|
|
767
|
+
a_dict[k] = "true" if v else "false"
|
|
768
|
+
|
|
769
|
+
actions.append(a_dict)
|
|
770
|
+
|
|
771
|
+
params[key] = json.dumps(actions)
|
|
772
|
+
|
|
773
|
+
if isinstance(value, bool):
|
|
774
|
+
params[key] = "true" if value else "false"
|
|
775
|
+
|
|
776
|
+
return params
|
|
777
|
+
|
|
778
|
+
@classmethod
|
|
779
|
+
def from_url(cls: type[Self], api_url: str) -> RequestParameters:
|
|
780
|
+
"""Instantiates a `RequestParameters` instance by parsing a raw
|
|
781
|
+
Scrape.do API URL string.
|
|
782
|
+
|
|
783
|
+
tip: Accepted URLs
|
|
784
|
+
This method accepts both raw and encoded URLs by using
|
|
785
|
+
the `urllib.parse.parse_qs` and `urllib.parse.unquote_plus`
|
|
786
|
+
functions to normalize encoded URLs.
|
|
787
|
+
|
|
788
|
+
warning: Browser Actions (`playWithBrowser`)
|
|
789
|
+
When providing a URL containing the `playWithBrowser` parameter,
|
|
790
|
+
make sure to use the `json.dumps` function to stringify the list
|
|
791
|
+
of dictionaries containing the entries. Both the raw and ecoded
|
|
792
|
+
URLs can be passed to this method afterwards.
|
|
793
|
+
|
|
794
|
+
warning: API Token
|
|
795
|
+
This method ignores the `&token=` parameter containing the
|
|
796
|
+
Scrape.do API key, since its insertion is meant to be handled by
|
|
797
|
+
the `ScrapeDoClient` using either an initialization parameter, or
|
|
798
|
+
the `SCRAPE_DO_API_KEY` environment variable.
|
|
799
|
+
|
|
800
|
+
Args:
|
|
801
|
+
api_url (str): The full Scrape.do endpoint
|
|
802
|
+
(`https://api.scrape.do/?url=...&render=true...`)
|
|
803
|
+
|
|
804
|
+
Raises:
|
|
805
|
+
ValueError: If the value found in the `&playWithBrowser=` parameter
|
|
806
|
+
is not a parsable JSON string.
|
|
807
|
+
|
|
808
|
+
Returns:
|
|
809
|
+
The `RequestParameters` instance mapping the URL parameters
|
|
810
|
+
(`&render=true&...`) to validated attributes
|
|
811
|
+
"""
|
|
812
|
+
|
|
813
|
+
parsed = urllib.parse.urlparse(api_url)
|
|
814
|
+
query_params = urllib.parse.parse_qs(parsed.query)
|
|
815
|
+
# Type parsed params as Dict[str, Any] and let Pydantic raise a
|
|
816
|
+
# ValidationError if it can't coerce a specific value
|
|
817
|
+
flat_params: Dict[str, Any] = {
|
|
818
|
+
k: v[0] for k, v in query_params.items()
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
# Reconstruct the nested JSON actions if they exist
|
|
822
|
+
if "playWithBrowser" in flat_params:
|
|
823
|
+
try:
|
|
824
|
+
# Manually convert '+' to ' ' specifically for this JSON string
|
|
825
|
+
decoded = urllib.parse.unquote_plus(
|
|
826
|
+
flat_params["playWithBrowser"]
|
|
827
|
+
)
|
|
828
|
+
|
|
829
|
+
flat_params["playWithBrowser"] = json.loads(decoded)
|
|
830
|
+
|
|
831
|
+
except json.JSONDecodeError as e:
|
|
832
|
+
raise ValueError(
|
|
833
|
+
f"Failed to decode `playWithBrowser` parameter from URL | "
|
|
834
|
+
f"Parameter Value : {flat_params['playWithBrowser']}"
|
|
835
|
+
) from e
|
|
836
|
+
|
|
837
|
+
# Strip Token
|
|
838
|
+
flat_params.pop("token", None)
|
|
839
|
+
|
|
840
|
+
return cls(**flat_params)
|