scrapling 0.2.5__py3-none-any.whl → 0.2.6__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/engines/pw.py +15 -9
- scrapling/engines/toolbelt/fingerprints.py +1 -1
- scrapling/fetchers.py +7 -5
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/METADATA +6 -3
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/RECORD +9 -9
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/LICENSE +0 -0
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/WHEEL +0 -0
- {scrapling-0.2.5.dist-info → scrapling-0.2.6.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.5"
|
7
|
+
__version__ = "0.2.6"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
scrapling/engines/pw.py
CHANGED
@@ -27,11 +27,12 @@ class PlaywrightEngine:
|
|
27
27
|
page_action: Callable = do_nothing,
|
28
28
|
wait_selector: Optional[str] = None,
|
29
29
|
wait_selector_state: Optional[str] = 'attached',
|
30
|
-
stealth: bool = False,
|
31
|
-
|
32
|
-
|
30
|
+
stealth: Optional[bool] = False,
|
31
|
+
real_chrome: Optional[bool] = False,
|
32
|
+
hide_canvas: Optional[bool] = False,
|
33
|
+
disable_webgl: Optional[bool] = False,
|
33
34
|
cdp_url: Optional[str] = None,
|
34
|
-
nstbrowser_mode: bool = False,
|
35
|
+
nstbrowser_mode: Optional[bool] = False,
|
35
36
|
nstbrowser_config: Optional[Dict] = None,
|
36
37
|
google_search: Optional[bool] = True,
|
37
38
|
extra_headers: Optional[Dict[str, str]] = None,
|
@@ -51,6 +52,7 @@ class PlaywrightEngine:
|
|
51
52
|
:param wait_selector: Wait for a specific css selector to be in a specific state.
|
52
53
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
|
53
54
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
55
|
+
:param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
|
54
56
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
55
57
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
56
58
|
:param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
|
@@ -67,6 +69,7 @@ class PlaywrightEngine:
|
|
67
69
|
self.stealth = bool(stealth)
|
68
70
|
self.hide_canvas = bool(hide_canvas)
|
69
71
|
self.disable_webgl = bool(disable_webgl)
|
72
|
+
self.real_chrome = bool(real_chrome)
|
70
73
|
self.google_search = bool(google_search)
|
71
74
|
self.extra_headers = extra_headers or {}
|
72
75
|
self.proxy = construct_proxy_dict(proxy)
|
@@ -119,7 +122,8 @@ class PlaywrightEngine:
|
|
119
122
|
:param url: Target url.
|
120
123
|
:return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
|
121
124
|
"""
|
122
|
-
if not self.stealth:
|
125
|
+
if not self.stealth or self.real_chrome:
|
126
|
+
# Because rebrowser_playwright doesn't play well with real browsers
|
123
127
|
from playwright.sync_api import sync_playwright
|
124
128
|
else:
|
125
129
|
from rebrowser_playwright.sync_api import sync_playwright
|
@@ -130,8 +134,8 @@ class PlaywrightEngine:
|
|
130
134
|
extra_headers = {}
|
131
135
|
useragent = self.useragent
|
132
136
|
else:
|
133
|
-
extra_headers =
|
134
|
-
useragent =
|
137
|
+
extra_headers = {}
|
138
|
+
useragent = generate_headers(browser_mode=True).get('User-Agent')
|
135
139
|
|
136
140
|
# Prepare the flags before diving
|
137
141
|
flags = DEFAULT_STEALTH_FLAGS
|
@@ -146,9 +150,11 @@ class PlaywrightEngine:
|
|
146
150
|
browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
|
147
151
|
else:
|
148
152
|
if self.stealth:
|
149
|
-
browser = p.chromium.launch(
|
153
|
+
browser = p.chromium.launch(
|
154
|
+
headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
|
155
|
+
)
|
150
156
|
else:
|
151
|
-
browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
|
157
|
+
browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
|
152
158
|
|
153
159
|
# Creating the context
|
154
160
|
if self.stealth:
|
@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
|
|
67
67
|
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
68
68
|
os_name = get_os_name()
|
69
69
|
return HeaderGenerator(
|
70
|
-
browser=[Browser(name='chrome', min_version=
|
70
|
+
browser=[Browser(name='chrome', min_version=130)],
|
71
71
|
os=os_name, # None is ignored
|
72
72
|
device='desktop'
|
73
73
|
).generate()
|
scrapling/fetchers.py
CHANGED
@@ -138,7 +138,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
138
138
|
2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
|
139
139
|
3) Using custom flags on launch to hide Playwright even more and make it faster.
|
140
140
|
4) Generates real browser's headers of the same type and same user OS then append it to the request.
|
141
|
-
- Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
141
|
+
- Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
142
142
|
- NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
|
143
143
|
|
144
144
|
> Note that these are the main options with PlayWright but it can be mixed together.
|
@@ -146,12 +146,12 @@ class PlayWrightFetcher(BaseFetcher):
|
|
146
146
|
def fetch(
|
147
147
|
self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
|
148
148
|
useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
|
149
|
-
page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
|
150
|
-
hide_canvas: bool =
|
149
|
+
page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
|
150
|
+
hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
|
151
151
|
proxy: Optional[Union[str, Dict[str, str]]] = None,
|
152
|
-
stealth: bool = False,
|
152
|
+
stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
|
153
153
|
cdp_url: Optional[str] = None,
|
154
|
-
nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
|
154
|
+
nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
|
155
155
|
) -> Response:
|
156
156
|
"""Opens up a browser and do your request based on your chosen options below.
|
157
157
|
|
@@ -167,6 +167,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
167
167
|
:param wait_selector: Wait for a specific css selector to be in a specific state.
|
168
168
|
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
|
169
169
|
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
|
170
|
+
:param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
|
170
171
|
:param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
|
171
172
|
:param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
|
172
173
|
:param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
|
@@ -184,6 +185,7 @@ class PlayWrightFetcher(BaseFetcher):
|
|
184
185
|
cdp_url=cdp_url,
|
185
186
|
headless=headless,
|
186
187
|
useragent=useragent,
|
188
|
+
real_chrome=real_chrome,
|
187
189
|
page_action=page_action,
|
188
190
|
hide_canvas=hide_canvas,
|
189
191
|
network_idle=network_idle,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.5
|
3
|
+
Version: 0.2.6
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -39,7 +39,7 @@ Requires-Dist: w3lib
|
|
39
39
|
Requires-Dist: orjson>=3
|
40
40
|
Requires-Dist: tldextract
|
41
41
|
Requires-Dist: httpx[brotli,zstd]
|
42
|
-
Requires-Dist: playwright
|
42
|
+
Requires-Dist: playwright==1.48
|
43
43
|
Requires-Dist: rebrowser-playwright
|
44
44
|
Requires-Dist: camoufox>=0.3.10
|
45
45
|
Requires-Dist: browserforge
|
@@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with:
|
|
336
336
|
* Mimics some of the real browsers' properties by injecting several JS files and using custom options.
|
337
337
|
* Using custom flags on launch to hide Playwright even more and make it faster.
|
338
338
|
* Generates real browser's headers of the same type and same user OS then append it to the request's headers.
|
339
|
-
3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
339
|
+
3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
340
340
|
4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
|
341
341
|
|
342
|
+
> Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
|
343
|
+
|
342
344
|
Add that to a lot of controlling/hiding options as you will see in the arguments list below.
|
343
345
|
|
344
346
|
<details><summary><strong>Expand this for the complete list of arguments</strong></summary>
|
@@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
|
|
360
362
|
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
|
361
363
|
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
|
362
364
|
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
|
365
|
+
| real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
|
363
366
|
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
|
364
367
|
| nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
|
365
368
|
| nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
|
@@ -1,6 +1,6 @@
|
|
1
|
-
scrapling/__init__.py,sha256=
|
1
|
+
scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
|
2
2
|
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
|
-
scrapling/fetchers.py,sha256
|
3
|
+
scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
|
4
4
|
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
5
5
|
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
6
|
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
@@ -13,11 +13,11 @@ scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
|
13
13
|
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
14
|
scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
|
15
15
|
scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
|
16
|
-
scrapling/engines/pw.py,sha256=
|
16
|
+
scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
|
17
17
|
scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
|
18
18
|
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
19
|
scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
|
20
|
-
scrapling/engines/toolbelt/fingerprints.py,sha256=
|
20
|
+
scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
|
21
21
|
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
22
|
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
23
|
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
@@ -35,8 +35,8 @@ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,
|
|
35
35
|
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
36
|
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
37
37
|
tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
|
38
|
-
scrapling-0.2.
|
39
|
-
scrapling-0.2.
|
40
|
-
scrapling-0.2.
|
41
|
-
scrapling-0.2.
|
42
|
-
scrapling-0.2.
|
38
|
+
scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
39
|
+
scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
|
40
|
+
scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
41
|
+
scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
42
|
+
scrapling-0.2.6.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|