scrapling 0.3.5__py3-none-any.whl → 0.3.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +29 -19
- scrapling/cli.py +21 -4
- scrapling/core/_types.py +3 -2
- scrapling/core/ai.py +24 -15
- scrapling/core/custom_types.py +20 -27
- scrapling/core/mixins.py +15 -9
- scrapling/core/shell.py +6 -4
- scrapling/core/storage.py +7 -6
- scrapling/core/translator.py +13 -8
- scrapling/core/utils/__init__.py +0 -1
- scrapling/engines/_browsers/__init__.py +0 -2
- scrapling/engines/_browsers/_base.py +45 -21
- scrapling/engines/_browsers/_camoufox.py +98 -43
- scrapling/engines/_browsers/_config_tools.py +1 -1
- scrapling/engines/_browsers/_controllers.py +34 -13
- scrapling/engines/_browsers/_validators.py +31 -10
- scrapling/engines/constants.py +0 -15
- scrapling/engines/static.py +749 -336
- scrapling/engines/toolbelt/convertor.py +13 -15
- scrapling/engines/toolbelt/custom.py +6 -9
- scrapling/engines/toolbelt/fingerprints.py +17 -10
- scrapling/engines/toolbelt/navigation.py +11 -3
- scrapling/fetchers/__init__.py +46 -0
- scrapling/fetchers/chrome.py +210 -0
- scrapling/fetchers/firefox.py +212 -0
- scrapling/fetchers/requests.py +28 -0
- scrapling/parser.py +109 -84
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/METADATA +17 -16
- scrapling-0.3.7.dist-info/RECORD +47 -0
- scrapling/fetchers.py +0 -444
- scrapling-0.3.5.dist-info/RECORD +0 -44
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/WHEEL +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/entry_points.txt +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/licenses/LICENSE +0 -0
- {scrapling-0.3.5.dist-info → scrapling-0.3.7.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_controllers.py
CHANGED
@@ -10,6 +10,7 @@ from playwright.async_api import (
     BrowserContext as AsyncBrowserContext,
     Playwright as AsyncPlaywright,
     Locator as AsyncLocator,
+    Page as async_Page,
 )
 from patchright.sync_api import sync_playwright as sync_patchright
 from patchright.async_api import async_playwright as async_patchright
@@ -18,10 +19,12 @@ from scrapling.core.utils import log
 from ._base import SyncSession, AsyncSession, DynamicSessionMixin
 from ._validators import validate_fetch as _validate
 from scrapling.core._types import (
+    Any,
     Dict,
     List,
     Optional,
     Callable,
+    TYPE_CHECKING,
     SelectorWaitStates,
 )
 from scrapling.engines.toolbelt.convertor import (
@@ -30,7 +33,7 @@ from scrapling.engines.toolbelt.convertor import (
 )
 from scrapling.engines.toolbelt.fingerprints import generate_convincing_referer

-_UNSET = object()
+_UNSET: Any = object()


 class DynamicSession(DynamicSessionMixin, SyncSession):
@@ -94,7 +97,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling, it's using a persistent browser Context by default with a temporary user profile directory.

@@ -117,11 +122,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param load_dom: Enabled by default, wait for all JavaScript on page(s) to fully load and execute.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """
         self.__validate__(
             wait=wait,
@@ -140,11 +147,13 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
@@ -154,14 +163,14 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
         """Create a browser for this instance and context."""
         sync_context = sync_patchright if self.stealth else sync_playwright

-        self.playwright: Playwright = sync_context().start()
+        self.playwright: Playwright = sync_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:  # pragma: no cover
             self.context = self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url).new_context(
                 **self.context_options
             )
         else:
-            self.context = self.playwright.chromium.launch_persistent_context(
+            self.context = self.playwright.chromium.launch_persistent_context(**self.launch_options)

         if self.init_script:  # pragma: no cover
             self.context.add_init_script(path=self.init_script)
@@ -187,7 +196,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):

         if self.playwright:
             self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -254,6 +263,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response

@@ -299,7 +309,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
             page_info.page, first_response, final_response, params.selector_config
         )

-        # Close the page
+        # Close the page to free up resources
         page_info.page.close()
         self.page_pool.pages.remove(page_info)

@@ -337,7 +347,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         network_idle: bool = False,
         load_dom: bool = True,
         wait_selector_state: SelectorWaitStates = "attached",
+        user_data_dir: str = "",
         selector_config: Optional[Dict] = None,
+        additional_args: Optional[Dict] = None,
     ):
         """A Browser session manager with page pooling

@@ -360,12 +372,14 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers
+        :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
         :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param max_pages: The maximum number of tabs to be opened at the same time. It will be used in rotation through a PagePool.
+        :param user_data_dir: Path to a User Data Directory, which stores browser session data like cookies and local storage. The default is to create a temporary directory.
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
+        :param additional_args: Additional arguments to be passed to Playwright's context as additional settings, and it takes higher priority than Scrapling's settings.
         """

         self.__validate__(
@@ -385,11 +399,13 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             hide_canvas=hide_canvas,
             init_script=init_script,
             network_idle=network_idle,
+            user_data_dir=user_data_dir,
             google_search=google_search,
             extra_headers=extra_headers,
             wait_selector=wait_selector,
             disable_webgl=disable_webgl,
             selector_config=selector_config,
+            additional_args=additional_args,
             disable_resources=disable_resources,
             wait_selector_state=wait_selector_state,
         )
@@ -399,21 +415,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
         """Create a browser for this instance and context."""
         async_context = async_patchright if self.stealth else async_playwright

-        self.playwright: AsyncPlaywright = await async_context().start()
+        self.playwright: AsyncPlaywright = await async_context().start()  # pyright: ignore [reportAttributeAccessIssue]

         if self.cdp_url:
             browser = await self.playwright.chromium.connect_over_cdp(endpoint_url=self.cdp_url)
             self.context: AsyncBrowserContext = await browser.new_context(**self.context_options)
         else:
             self.context: AsyncBrowserContext = await self.playwright.chromium.launch_persistent_context(
-
+                **self.launch_options
             )

         if self.init_script:  # pragma: no cover
             await self.context.add_init_script(path=self.init_script)

         if self.cookies:
-            await self.context.add_cookies(self.cookies)
+            await self.context.add_cookies(self.cookies)  # pyright: ignore

     async def __aenter__(self):
         await self.__create__()
@@ -429,11 +445,11 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):

         if self.context:
             await self.context.close()
-            self.context = None
+            self.context = None  # pyright: ignore

         if self.playwright:
             await self.playwright.stop()
-            self.playwright = None
+            self.playwright = None  # pyright: ignore

         self._closed = True

@@ -500,12 +516,17 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             if (
                 finished_response.request.resource_type == "document"
                 and finished_response.request.is_navigation_request()
+                and finished_response.request.frame == page_info.page.main_frame
             ):
                 final_response = finished_response

         page_info = await self._get_page(params.timeout, params.extra_headers, params.disable_resources)
         page_info.mark_busy(url=url)

+        if TYPE_CHECKING:
+            if not isinstance(page_info.page, async_Page):
+                raise TypeError
+
         try:
             # Navigate to URL and wait for a specified state
             page_info.page.on("response", handle_response)
@@ -545,7 +566,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
             page_info.page, first_response, final_response, params.selector_config
         )

-        # Close the page
+        # Close the page to free up resources
         await page_info.page.close()
         self.page_pool.pages.remove(page_info)
         return response
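The new `user_data_dir` and `additional_args` session arguments are the user-facing part of this change. A minimal usage sketch follows; it assumes `DynamicSession` is importable from `scrapling.fetchers` (the new fetchers package above) and still exposes a `fetch()` method and a `status` attribute on the returned response, as in previous releases. The paths and values shown are hypothetical.

    # Hedged sketch, not taken from the diff: exercises the two arguments added in 0.3.7.
    from scrapling.fetchers import DynamicSession  # assumed import path

    with DynamicSession(
        user_data_dir="/tmp/scrapling-profile",  # new: reuse cookies/local storage across runs instead of a temp dir
        additional_args={"viewport": {"width": 1280, "height": 800}},  # new: forwarded to Playwright's context, overriding Scrapling's defaults
    ) as session:
        page = session.fetch("https://example.com")  # fetch() signature assumed from earlier versions
        print(page.status)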
scrapling/engines/_browsers/_validators.py
CHANGED
@@ -11,7 +11,9 @@ from scrapling.core._types import (
     Tuple,
     Optional,
     Callable,
+    Iterable,
     SelectorWaitStates,
+    overload,
 )
 from scrapling.engines.toolbelt.navigation import construct_proxy_dict

@@ -73,7 +75,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     stealth: bool = False
     wait: Seconds = 0
     page_action: Optional[Callable] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     locale: str = "en-US"
     extra_headers: Optional[Dict[str, str]] = None
     useragent: Optional[str] = None
@@ -81,11 +83,13 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
     init_script: Optional[str] = None
     disable_resources: bool = False
     wait_selector: Optional[str] = None
-    cookies: Optional[
+    cookies: Optional[Iterable[Dict]] = None
     network_idle: bool = False
     load_dom: bool = True
     wait_selector_state: SelectorWaitStates = "attached"
-
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -100,6 +104,8 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
             self.cookies = []
         if not self.selector_config:
             self.selector_config = {}
+        if not self.additional_args:
+            self.additional_args = {}

         if self.init_script is not None:
             _validate_file_path(self.init_script)
@@ -125,15 +131,16 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
     wait_selector: Optional[str] = None
     addons: Optional[List[str]] = None
     wait_selector_state: SelectorWaitStates = "attached"
-    cookies: Optional[
+    cookies: Optional[Iterable[Dict]] = None
     google_search: bool = True
     extra_headers: Optional[Dict[str, str]] = None
-    proxy: Optional[str | Dict[str, str]] = None  # The default value for proxy in Playwright's source is `None`
+    proxy: Optional[str | Dict[str, str] | Tuple] = None  # The default value for proxy in Playwright's source is `None`
     os_randomize: bool = False
     disable_ads: bool = False
     geoip: bool = False
-
-
+    user_data_dir: str = ""
+    selector_config: Optional[Dict] = {}
+    additional_args: Optional[Dict] = {}

     def __post_init__(self):
         """Custom validation after msgspec validation"""
@@ -177,7 +184,7 @@ class FetchConfig(Struct, kw_only=True):
     network_idle: bool = False
     load_dom: bool = True
     solve_cloudflare: bool = False
-    selector_config:
+    selector_config: Dict = {}

     def to_dict(self):
         return {f: getattr(self, f) for f in self.__struct_fields__}
@@ -198,7 +205,7 @@ class _fetch_params:
     network_idle: bool
     load_dom: bool
     solve_cloudflare: bool
-    selector_config:
+    selector_config: Dict


 def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
@@ -222,7 +229,21 @@ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
     return _fetch_params(**result)


-
+@overload
+def validate(params: Dict, model: type[PlaywrightConfig]) -> PlaywrightConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[CamoufoxConfig]) -> CamoufoxConfig: ...
+
+
+@overload
+def validate(params: Dict, model: type[FetchConfig]) -> FetchConfig: ...
+
+
+def validate(
+    params: Dict, model: type[PlaywrightConfig] | type[CamoufoxConfig] | type[FetchConfig]
+) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
     try:
         return convert(params, model)
     except ValidationError as e:
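For reference, the `@overload` block added at the end of `_validators.py` is a thin typed wrapper around `msgspec.convert`. The self-contained sketch below shows the pattern it implements: a plain keyword dict is coerced into the matching config Struct, and invalid input raises a validation error. The `PlaywrightConfig` here is a trimmed stand-in and the error handling is hypothetical, not the package's actual code.

    # Sketch of the validate() pattern under the assumptions stated above.
    from msgspec import Struct, ValidationError, convert


    class PlaywrightConfig(Struct, kw_only=True, frozen=False):  # trimmed stand-in, not the real class
        headless: bool = True
        user_data_dir: str = ""
        additional_args: dict = {}


    def validate(params: dict, model: type[PlaywrightConfig]) -> PlaywrightConfig:
        try:
            return convert(params, model)  # msgspec performs the type checking/coercion
        except ValidationError as e:
            raise TypeError(f"Invalid session argument: {e}") from e  # hypothetical handling


    cfg = validate({"headless": False, "user_data_dir": "/tmp/profile"}, PlaywrightConfig)
    print(cfg.headless, cfg.user_data_dir)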
scrapling/engines/constants.py
CHANGED
@@ -101,18 +101,3 @@ DEFAULT_STEALTH_FLAGS = (
     "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
     "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
 )
-
-# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
-NSTBROWSER_DEFAULT_QUERY = {
-    "once": True,
-    "headless": True,
-    "autoClose": True,
-    "fingerprint": {
-        "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
-        "platform": "linux",  # support: windows, mac, linux
-        "kernel": "chromium",  # only support: chromium
-        "kernelMilestone": "128",
-        "hardwareConcurrency": 8,
-        "deviceMemory": 8,
-    },
-}