scrapling 0.2.4__tar.gz → 0.2.6__tar.gz

Files changed (49)
  1. {scrapling-0.2.4/scrapling.egg-info → scrapling-0.2.6}/PKG-INFO +6 -3
  2. {scrapling-0.2.4 → scrapling-0.2.6}/README.md +4 -1
  3. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/__init__.py +1 -1
  4. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/custom_types.py +2 -3
  5. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/translator.py +6 -4
  6. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/camo.py +3 -6
  7. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/pw.py +18 -15
  8. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/static.py +5 -1
  9. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/custom.py +69 -1
  10. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/fingerprints.py +1 -1
  11. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/fetchers.py +22 -12
  12. {scrapling-0.2.4 → scrapling-0.2.6/scrapling.egg-info}/PKG-INFO +6 -3
  13. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/SOURCES.txt +1 -0
  14. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/requires.txt +1 -1
  15. {scrapling-0.2.4 → scrapling-0.2.6}/setup.cfg +1 -1
  16. {scrapling-0.2.4 → scrapling-0.2.6}/setup.py +2 -2
  17. {scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/test_camoufox.py +2 -0
  18. {scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/test_playwright.py +2 -0
  19. scrapling-0.2.6/tests/fetchers/test_utils.py +129 -0
  20. {scrapling-0.2.4 → scrapling-0.2.6}/tests/parser/test_general.py +1 -1
  21. {scrapling-0.2.4 → scrapling-0.2.6}/LICENSE +0 -0
  22. {scrapling-0.2.4 → scrapling-0.2.6}/MANIFEST.in +0 -0
  23. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/__init__.py +0 -0
  24. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/_types.py +0 -0
  25. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/mixins.py +0 -0
  26. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/storage_adaptors.py +0 -0
  27. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/utils.py +0 -0
  28. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/defaults.py +0 -0
  29. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/__init__.py +0 -0
  30. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/constants.py +0 -0
  31. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/__init__.py +0 -0
  32. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  33. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  34. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -0
  35. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  36. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  37. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  38. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  39. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/navigation.py +0 -0
  40. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/parser.py +0 -0
  41. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling/py.typed +0 -0
  42. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/dependency_links.txt +0 -0
  43. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/not-zip-safe +0 -0
  44. {scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/top_level.txt +0 -0
  45. {scrapling-0.2.4 → scrapling-0.2.6}/tests/__init__.py +0 -0
  46. {scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/__init__.py +0 -0
  47. {scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/test_httpx.py +0 -0
  48. {scrapling-0.2.4 → scrapling-0.2.6}/tests/parser/__init__.py +0 -0
  49. {scrapling-0.2.4 → scrapling-0.2.6}/tests/parser/test_automatch.py +0 -0
{scrapling-0.2.4/scrapling.egg-info → scrapling-0.2.6}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.4
+ Version: 0.2.6
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -39,7 +39,7 @@ Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright
+ Requires-Dist: playwright==1.48
  Requires-Dist: rebrowser-playwright
  Requires-Dist: camoufox>=0.3.10
  Requires-Dist: browserforge
@@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with:
  * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
  * Using custom flags on launch to hide Playwright even more and make it faster.
  * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
- 3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+ 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

+ > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

  <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+ | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
{scrapling-0.2.4 → scrapling-0.2.6}/README.md

@@ -290,9 +290,11 @@ Using this Fetcher class, you can make requests with:
  * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
  * Using custom flags on launch to hide Playwright even more and make it faster.
  * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
- 3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+ 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

+ > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

  <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -314,6 +316,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+ | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
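The gist of this README change is the new `real_chrome` flag on `PlayWrightFetcher`. A minimal, hedged usage sketch (hypothetical target URL; assumes Google Chrome is installed locally, and uses `auto_match=False` only to keep the snippet self-contained):

```python
from scrapling import PlayWrightFetcher

# real_chrome=True (new in 0.2.6) launches your locally installed Chrome
# instead of Playwright's bundled Chromium; it requires Chrome on the machine.
fetcher = PlayWrightFetcher(auto_match=False)
page = fetcher.fetch('https://example.com', real_chrome=True)
print(page.status, page.reason)
```

Pairing this feature with the `playwright==1.48` pin above appears deliberate: per the setup.py comment later in this diff, the CDP-patching libraries did not yet support Playwright 1.49 at release time.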
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/__init__.py

@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
  from scrapling.core.custom_types import TextHandler, AttributesHandler

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.4"
+ __version__ = "0.2.6"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/custom_types.py

@@ -129,9 +129,8 @@ class TextHandlers(List[TextHandler]):


  class AttributesHandler(Mapping):
-     """A read-only mapping to use instead of the standard dictionary for the speed boost but
-     at the same time I use it to add more functionalities.
-     If standard dictionary is needed, just convert this class to dictionary with `dict` function
+     """A read-only mapping to use instead of the standard dictionary for the speed boost but at the same time I use it to add more functionalities.
+     If standard dictionary is needed, just convert this class to dictionary with `dict` function
      """
      __slots__ = ('_data',)

{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/core/translator.py

@@ -1,9 +1,11 @@
  """
  Most of this file is adapted version of the translator of parsel library with some modifications simply for 1 important reason...
- To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format
- which will be important in future releases but most importantly...
- so you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
- > if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
+
+ To add pseudo-elements ``::text`` and ``::attr(ATTR_NAME)`` so we match Parsel/Scrapy selectors format which will be important in future releases but most importantly...
+
+ So you don't have to learn a new selectors/api method like what bs4 done with soupsieve :)
+
+ if you want to learn about this, head to https://cssselect.readthedocs.io/en/latest/#cssselect.FunctionalPseudoElement
  """

  import re
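To make the pseudo-element support concrete, a small illustrative sketch (inline HTML, no network; `auto_match=False` keeps the snippet self-contained):

```python
from scrapling import Adaptor

page = Adaptor('<html><body><a href="/about">About us</a></body></html>', auto_match=False)
# Parsel/Scrapy-style pseudo-elements handled by this translator:
print(page.css('a::text'))        # the text content of <a>
print(page.css('a::attr(href)'))  # the value of the href attribute
```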
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/camo.py

@@ -104,13 +104,10 @@ class CamoufoxEngine:

  if self.wait_selector and type(self.wait_selector) is str:
      waiter = page.locator(self.wait_selector)
-     waiter.wait_for(state=self.wait_selector_state)
+     waiter.first.wait_for(state=self.wait_selector_state)

- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8'  # default encoding
- if 'charset=' in content_type.lower():
-     encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding

  status_text = res.status_text
  # PlayWright API sometimes give empty status text for some reason!
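The switch to `waiter.first` is the substantive fix here: a Playwright locator that matches several elements raises a strict-mode error when you wait on it directly, while `.first` pins the wait to a single match. A plain-Playwright sketch of the fixed pattern (hypothetical page, not Scrapling-specific):

```python
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto('https://example.com')
    waiter = page.locator('h1')              # may match more than one element
    waiter.first.wait_for(state='attached')  # old: waiter.wait_for(...) could raise
    browser.close()
```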
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/pw.py

@@ -27,11 +27,12 @@ class PlaywrightEngine:
  page_action: Callable = do_nothing,
  wait_selector: Optional[str] = None,
  wait_selector_state: Optional[str] = 'attached',
- stealth: bool = False,
- hide_canvas: bool = True,
- disable_webgl: bool = False,
+ stealth: Optional[bool] = False,
+ real_chrome: Optional[bool] = False,
+ hide_canvas: Optional[bool] = False,
+ disable_webgl: Optional[bool] = False,
  cdp_url: Optional[str] = None,
- nstbrowser_mode: bool = False,
+ nstbrowser_mode: Optional[bool] = False,
  nstbrowser_config: Optional[Dict] = None,
  google_search: Optional[bool] = True,
  extra_headers: Optional[Dict[str, str]] = None,
@@ -51,6 +52,7 @@ class PlaywrightEngine:
  :param wait_selector: Wait for a specific css selector to be in a specific state.
  :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
  :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+ :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
  :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
  :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
  :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
@@ -67,6 +69,7 @@ class PlaywrightEngine:
  self.stealth = bool(stealth)
  self.hide_canvas = bool(hide_canvas)
  self.disable_webgl = bool(disable_webgl)
+ self.real_chrome = bool(real_chrome)
  self.google_search = bool(google_search)
  self.extra_headers = extra_headers or {}
  self.proxy = construct_proxy_dict(proxy)
@@ -119,7 +122,8 @@ class PlaywrightEngine:
  :param url: Target url.
  :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
  """
- if not self.stealth:
+ if not self.stealth or self.real_chrome:
+     # Because rebrowser_playwright doesn't play well with real browsers
      from playwright.sync_api import sync_playwright
  else:
      from rebrowser_playwright.sync_api import sync_playwright
@@ -130,8 +134,8 @@ class PlaywrightEngine:
      extra_headers = {}
      useragent = self.useragent
  else:
-     extra_headers = generate_headers(browser_mode=True)
-     useragent = extra_headers.get('User-Agent')
+     extra_headers = {}
+     useragent = generate_headers(browser_mode=True).get('User-Agent')

  # Prepare the flags before diving
  flags = DEFAULT_STEALTH_FLAGS
@@ -146,9 +150,11 @@ class PlaywrightEngine:
      browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
  else:
      if self.stealth:
-         browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
+         browser = p.chromium.launch(
+             headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+         )
      else:
-         browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+         browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')

  # Creating the context
  if self.stealth:
@@ -214,13 +220,10 @@ class PlaywrightEngine:

  if self.wait_selector and type(self.wait_selector) is str:
      waiter = page.locator(self.wait_selector)
-     waiter.wait_for(state=self.wait_selector_state)
+     waiter.first.wait_for(state=self.wait_selector_state)

- content_type = res.headers.get('content-type', '')
- # Parse charset from content-type
- encoding = 'utf-8'  # default encoding
- if 'charset=' in content_type.lower():
-     encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+ # This will be parsed inside `Response`
+ encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding

  status_text = res.status_text
  # PlayWright API sometimes give empty status text for some reason!
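Both launch branches now pass Playwright's `channel` option, which selects an installed browser build over the bundled Chromium. A reduced sketch of the new branch (the flag value is illustrative; in scrapling it is `self.real_chrome`):

```python
from playwright.sync_api import sync_playwright

real_chrome = True  # illustrative stand-in for self.real_chrome

with sync_playwright() as p:
    # channel='chrome' uses the system's Google Chrome install;
    # 'chromium' falls back to the browser bundled with Playwright.
    browser = p.chromium.launch(
        headless=True,
        ignore_default_args=['--enable-automation'],
        channel='chrome' if real_chrome else 'chromium',
    )
    browser.close()
```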
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/static.py

@@ -23,7 +23,7 @@ class StaticEngine:
  @staticmethod
  def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
      """Adds useragent to headers if it doesn't exist, generates real headers and append it to current headers, and
-     finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.
+      finally generates a referer header that looks like if this request came from Google's search of the current URL's domain.

      :param headers: Current headers in the request if the user passed any
      :param url: The Target URL.
@@ -65,6 +65,7 @@ class StaticEngine:

  def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP GET request for you but with some added flavors.
+
      :param url: Target url.
      :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
        create a referer header as if this request had came from Google's search of this URL's domain.
@@ -77,6 +78,7 @@ class StaticEngine:

  def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP POST request for you but with some added flavors.
+
      :param url: Target url.
      :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
        create a referer header as if this request had came from Google's search of this URL's domain.
@@ -89,6 +91,7 @@ class StaticEngine:

  def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP DELETE request for you but with some added flavors.
+
      :param url: Target url.
      :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
        create a referer header as if this request had came from Google's search of this URL's domain.
@@ -101,6 +104,7 @@ class StaticEngine:

  def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP PUT request for you but with some added flavors.
+
      :param url: Target url.
      :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
        create a referer header as if this request had came from Google's search of this URL's domain.
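These hunks only touch docstrings; for context, this engine backs the plain `Fetcher` class. A minimal sketch of the `get` path the docstrings describe (hypothetical URL; `auto_match=False` keeps it self-contained):

```python
from scrapling import Fetcher

# stealthy_headers=True (the default) generates real-browser headers plus a
# referer that looks like a Google search for the target domain.
page = Fetcher(auto_match=False).get('https://example.com', stealthy_headers=True)
print(page.status, page.reason)
```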
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/custom.py

@@ -3,11 +3,78 @@ Functions related to custom types or type checking
  """
  import inspect
  import logging
+ from email.message import Message

  from scrapling.core.custom_types import MappingProxyType
  from scrapling.parser import Adaptor, SQLiteStorageSystem
  from scrapling.core.utils import setup_basic_logging, cache
- from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
+
+
+ class ResponseEncoding:
+     __DEFAULT_ENCODING = "utf-8"
+     __ISO_8859_1_CONTENT_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}
+
+     @classmethod
+     @cache(maxsize=None)
+     def __parse_content_type(cls, header_value: str) -> Tuple[str, Dict[str, str]]:
+         """Parse content type and parameters from a content-type header value.
+
+         Uses `email.message.Message` for robust header parsing according to RFC 2045.
+
+         :param header_value: Raw content-type header string
+         :return: Tuple of (content_type, parameters_dict)
+         """
+         # Create a Message object and set the Content-Type header then get the content type and parameters
+         msg = Message()
+         msg['content-type'] = header_value
+
+         content_type = msg.get_content_type()
+         params = dict(msg.get_params(failobj=[]))
+
+         # Remove the content-type from params if present somehow
+         params.pop('content-type', None)
+
+         return content_type, params
+
+     @classmethod
+     @cache(maxsize=None)
+     def get_value(cls, content_type: Optional[str]) -> str:
+         """Determine the appropriate character encoding from a content-type header.
+
+         The encoding is determined by these rules in order:
+         1. If no content-type is provided, use UTF-8
+         2. If charset parameter is present, use that encoding
+         3. If content-type is `text/*`, use ISO-8859-1 per HTTP/1.1 spec
+         4. If content-type is application/json, use UTF-8 per RFC 4627
+         5. Default to UTF-8 if nothing else matches
+
+         :param content_type: Content-Type header value or None
+         :return: String naming the character encoding
+         """
+         if not content_type:
+             return cls.__DEFAULT_ENCODING
+
+         try:
+             content_type, params = cls.__parse_content_type(content_type)
+
+             # First check for explicit charset parameter
+             if "charset" in params:
+                 encoding = params["charset"].strip("'\"")
+                 "test".encode(encoding)  # Validate encoding
+                 return encoding
+
+             # Apply content-type specific rules
+             if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                 return "ISO-8859-1"
+
+             if content_type == "application/json":
+                 return cls.__DEFAULT_ENCODING
+
+             return cls.__DEFAULT_ENCODING
+
+         except (ValueError, LookupError, UnicodeEncodeError):
+             return cls.__DEFAULT_ENCODING


  class Response(Adaptor):
@@ -20,6 +87,7 @@ class Response(Adaptor):
  self.cookies = cookies
  self.headers = headers
  self.request_headers = request_headers
+ encoding = ResponseEncoding.get_value(encoding)
  super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
  # For back-ward compatibility
  self.adaptor = self
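To see the new encoding rules in isolation, here is a self-contained sketch of the same logic; `guess_encoding` is a hypothetical name for illustration, not Scrapling's API:

```python
from email.message import Message
from typing import Optional

ISO_8859_1_TYPES = {"text/plain", "text/html", "text/css", "text/javascript"}

def guess_encoding(content_type: Optional[str]) -> str:
    """Mirror of ResponseEncoding.get_value: an explicit charset wins, then
    certain text/* types default to ISO-8859-1 per HTTP/1.1, else UTF-8."""
    if not content_type:
        return "utf-8"
    msg = Message()  # RFC 2045-aware parsing of the header value
    msg["content-type"] = content_type
    params = dict(msg.get_params(failobj=[]))
    if "charset" in params:
        return params["charset"].strip("'\"")
    if msg.get_content_type() in ISO_8859_1_TYPES:
        return "ISO-8859-1"
    return "utf-8"

assert guess_encoding("text/html") == "ISO-8859-1"
assert guess_encoding('application/json; charset="UTF-8"') == "UTF-8"
```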
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/engines/toolbelt/fingerprints.py

@@ -67,7 +67,7 @@ def generate_headers(browser_mode: bool = False) -> Dict:
  # So we don't raise any inconsistency red flags while websites fingerprinting us
  os_name = get_os_name()
  return HeaderGenerator(
-     browser=[Browser(name='chrome', min_version=128)],
+     browser=[Browser(name='chrome', min_version=130)],
      os=os_name,  # None is ignored
      device='desktop'
  ).generate()
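This change only raises the minimum Chrome version requested from browserforge. A standalone sketch of the equivalent call (OS pinning omitted for brevity):

```python
from browserforge.headers import Browser, HeaderGenerator

# Ask browserforge for desktop Chrome >= 130 headers, matching the diff above.
headers = HeaderGenerator(
    browser=[Browser(name='chrome', min_version=130)],
    device='desktop',
).generate()
print(headers.get('User-Agent'))
```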
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling/fetchers.py

@@ -11,6 +11,7 @@ class Fetcher(BaseFetcher):
  """
  def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP GET request for you but with some added flavors.
+
      :param url: Target url.
      :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
      :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -24,6 +25,7 @@ class Fetcher(BaseFetcher):

  def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP POST request for you but with some added flavors.
+
      :param url: Target url.
      :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
      :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -37,12 +39,14 @@ class Fetcher(BaseFetcher):

  def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP PUT request for you but with some added flavors.
+
      :param url: Target url
      :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
      :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
      :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
-       create a referer header as if this request came from Google's search of this URL's domain.
+         create a referer header as if this request came from Google's search of this URL's domain.
      :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
+
      :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
      """
      response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
@@ -50,6 +54,7 @@ class Fetcher(BaseFetcher):

  def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
      """Make basic HTTP DELETE request for you but with some added flavors.
+
      :param url: Target url
      :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
      :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
@@ -77,6 +82,7 @@ class StealthyFetcher(BaseFetcher):
  ) -> Response:
      """
      Opens up a browser and do your request based on your chosen options below.
+
      :param url: Target url.
      :param headless: Run the browser in headless/hidden (default), 'virtual' screen mode, or headful/visible mode.
      :param block_images: Prevent the loading of images through Firefox preferences.
@@ -127,26 +133,28 @@ class PlayWrightFetcher(BaseFetcher):
  Using this Fetcher class, you can do requests with:
  - Vanilla Playwright without any modifications other than the ones you chose.
  - Stealthy Playwright with the stealth mode I wrote for it. It's still a work in progress but it bypasses many online tests like bot.sannysoft.com
-     Some of the things stealth mode does include:
-     1) Patches the CDP runtime fingerprint.
-     2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
-     3) Using custom flags on launch to hide Playwright even more and make it faster.
-     4) Generates real browser's headers of the same type and same user OS then append it to the request.
- - Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+       Some of the things stealth mode does include:
+         1) Patches the CDP runtime fingerprint.
+         2) Mimics some of the real browsers' properties by injecting several JS files and using custom options.
+         3) Using custom flags on launch to hide Playwright even more and make it faster.
+         4) Generates real browser's headers of the same type and same user OS then append it to the request.
+ - Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  - NSTBrowser's docker browserless option by passing the CDP URL and enabling `nstbrowser_mode` option.
- > Note that these are the main options with PlayWright but it can be mixed together.
+
+ > Note that these are the main options with PlayWright but it can be mixed together.
  """
  def fetch(
      self, url: str, headless: Union[bool, str] = True, disable_resources: bool = None,
      useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
-     page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
-     hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+     page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
+     hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
      proxy: Optional[Union[str, Dict[str, str]]] = None,
-     stealth: bool = False,
+     stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
      cdp_url: Optional[str] = None,
-     nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
+     nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
  ) -> Response:
      """Opens up a browser and do your request based on your chosen options below.
+
      :param url: Target url.
      :param headless: Run the browser in headless/hidden (default), or headful/visible mode.
      :param disable_resources: Drop requests of unnecessary resources for speed boost. It depends but it made requests ~25% faster in my tests for some websites.
@@ -159,6 +167,7 @@ class PlayWrightFetcher(BaseFetcher):
      :param wait_selector: Wait for a specific css selector to be in a specific state.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
      :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
+     :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
      :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
      :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
@@ -176,6 +185,7 @@ class PlayWrightFetcher(BaseFetcher):
      cdp_url=cdp_url,
      headless=headless,
      useragent=useragent,
+     real_chrome=real_chrome,
      page_action=page_action,
      hide_canvas=hide_canvas,
      network_idle=network_idle,
{scrapling-0.2.4 → scrapling-0.2.6/scrapling.egg-info}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.4
+ Version: 0.2.6
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -39,7 +39,7 @@ Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright
+ Requires-Dist: playwright==1.48
  Requires-Dist: rebrowser-playwright
  Requires-Dist: camoufox>=0.3.10
  Requires-Dist: browserforge
@@ -336,9 +336,11 @@ Using this Fetcher class, you can make requests with:
  * Mimics some of the real browsers' properties by injecting several JS files and using custom options.
  * Using custom flags on launch to hide Playwright even more and make it faster.
  * Generates real browser's headers of the same type and same user OS then append it to the request's headers.
- 3) Real browsers by passing the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
+ 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

+ > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+
  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

  <details><summary><strong>Expand this for the complete list of arguments</strong></summary>
@@ -360,6 +362,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
+ | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/SOURCES.txt

@@ -42,6 +42,7 @@ tests/fetchers/__init__.py
  tests/fetchers/test_camoufox.py
  tests/fetchers/test_httpx.py
  tests/fetchers/test_playwright.py
+ tests/fetchers/test_utils.py
  tests/parser/__init__.py
  tests/parser/test_automatch.py
  tests/parser/test_general.py
{scrapling-0.2.4 → scrapling-0.2.6}/scrapling.egg-info/requires.txt

@@ -5,7 +5,7 @@ w3lib
  orjson>=3
  tldextract
  httpx[brotli,zstd]
- playwright
+ playwright==1.48
  rebrowser-playwright
  camoufox>=0.3.10
  browserforge
{scrapling-0.2.4 → scrapling-0.2.6}/setup.cfg

@@ -1,6 +1,6 @@
  [metadata]
  name = scrapling
- version = 0.2.4
+ version = 0.2.6
  author = Karim Shoair
  author_email = karim.shoair@pm.me
  description = Scrapling is an undetectable, powerful, flexible, adaptive, and high-performance web scraping library for Python.
{scrapling-0.2.4 → scrapling-0.2.6}/setup.py

@@ -6,7 +6,7 @@ with open("README.md", "r", encoding="utf-8") as fh:

  setup(
      name="scrapling",
-     version="0.2.4",
+     version="0.2.6",
      description="""Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
      simplifies the process of extracting data from websites, even when they undergo structural changes, and offers
      impressive speed improvements over many popular scraping tools.""",
@@ -55,7 +55,7 @@ setup(
      "orjson>=3",
      "tldextract",
      'httpx[brotli,zstd]',
-     'playwright',
+     'playwright==1.48',  # Temporary because currently All libraries that provide CDP patches doesn't support playwright 1.49 yet
      'rebrowser-playwright',
      'camoufox>=0.3.10',
      'browserforge',
{scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/test_camoufox.py

@@ -36,6 +36,7 @@ class TestStealthyFetcher(unittest.TestCase):
  def test_waiting_selector(self):
      """Test if waiting for a selector make page does not finish loading or not"""
      self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+     self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

  def test_cookies_loading(self):
      """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestStealthyFetcher(unittest.TestCase):
      self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=True).status, 200)
      self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=False, allow_webgl=True).status, 200)
      self.assertEqual(self.fetcher.fetch(self.html_url, block_webrtc=True, allow_webgl=False).status, 200)
+     self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}, os_randomize=True).status, 200)

  def test_infinite_timeout(self):
      """Test if infinite timeout breaks the code or not"""
{scrapling-0.2.4 → scrapling-0.2.6}/tests/fetchers/test_playwright.py

@@ -35,6 +35,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
  def test_waiting_selector(self):
      """Test if waiting for a selector make page does not finish loading or not"""
      self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1').status, 200)
+     self.assertEqual(self.fetcher.fetch(self.html_url, wait_selector='h1', wait_selector_state='visible').status, 200)

  def test_cookies_loading(self):
      """Test if cookies are set after the request"""
@@ -56,6 +57,7 @@ class TestPlayWrightFetcher(unittest.TestCase):
      self.assertEqual(self.fetcher.fetch(self.html_url, disable_webgl=False, hide_canvas=True).status, 200)
      self.assertEqual(self.fetcher.fetch(self.html_url, stealth=True).status, 200)
      self.assertEqual(self.fetcher.fetch(self.html_url, useragent='Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:131.0) Gecko/20100101 Firefox/131.0').status, 200)
+     self.assertEqual(self.fetcher.fetch(self.html_url, extra_headers={'ayo': ''}).status, 200)

  def test_cdp_url(self):
      """Test if it's going to try to connect to cdp url or not"""
scrapling-0.2.6/tests/fetchers/test_utils.py (new file)

@@ -0,0 +1,129 @@
+ import unittest
+
+ from scrapling.engines.toolbelt.custom import ResponseEncoding, StatusText
+
+
+ class TestPlayWrightFetcher(unittest.TestCase):
+     def setUp(self):
+         self.content_type_map = {
+             # A map generated by ChatGPT for most possible `content_type` values and the expected outcome
+             'text/html; charset=UTF-8': 'UTF-8',
+             'text/html; charset=ISO-8859-1': 'ISO-8859-1',
+             'text/html': 'ISO-8859-1',
+             'application/json; charset=UTF-8': 'UTF-8',
+             'application/json': 'utf-8',
+             'text/json': 'utf-8',
+             'application/javascript; charset=UTF-8': 'UTF-8',
+             'application/javascript': 'utf-8',
+             'text/plain; charset=UTF-8': 'UTF-8',
+             'text/plain; charset=ISO-8859-1': 'ISO-8859-1',
+             'text/plain': 'ISO-8859-1',
+             'application/xhtml+xml; charset=UTF-8': 'UTF-8',
+             'application/xhtml+xml': 'utf-8',
+             'text/html; charset=windows-1252': 'windows-1252',
+             'application/json; charset=windows-1252': 'windows-1252',
+             'text/plain; charset=windows-1252': 'windows-1252',
+             'text/html; charset="UTF-8"': 'UTF-8',
+             'text/html; charset="ISO-8859-1"': 'ISO-8859-1',
+             'text/html; charset="windows-1252"': 'windows-1252',
+             'application/json; charset="UTF-8"': 'UTF-8',
+             'application/json; charset="ISO-8859-1"': 'ISO-8859-1',
+             'application/json; charset="windows-1252"': 'windows-1252',
+             'text/json; charset="UTF-8"': 'UTF-8',
+             'application/javascript; charset="UTF-8"': 'UTF-8',
+             'application/javascript; charset="ISO-8859-1"': 'ISO-8859-1',
+             'text/plain; charset="UTF-8"': 'UTF-8',
+             'text/plain; charset="ISO-8859-1"': 'ISO-8859-1',
+             'text/plain; charset="windows-1252"': 'windows-1252',
+             'application/xhtml+xml; charset="UTF-8"': 'UTF-8',
+             'application/xhtml+xml; charset="ISO-8859-1"': 'ISO-8859-1',
+             'application/xhtml+xml; charset="windows-1252"': 'windows-1252',
+             'text/html; charset="US-ASCII"': 'US-ASCII',
+             'application/json; charset="US-ASCII"': 'US-ASCII',
+             'text/plain; charset="US-ASCII"': 'US-ASCII',
+             'text/html; charset="Shift_JIS"': 'Shift_JIS',
+             'application/json; charset="Shift_JIS"': 'Shift_JIS',
+             'text/plain; charset="Shift_JIS"': 'Shift_JIS',
+             'application/xml; charset="UTF-8"': 'UTF-8',
+             'application/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+             'application/xml': 'utf-8',
+             'text/xml; charset="UTF-8"': 'UTF-8',
+             'text/xml; charset="ISO-8859-1"': 'ISO-8859-1',
+             'text/xml': 'utf-8'
+         }
+         self.status_map = {
+             100: "Continue",
+             101: "Switching Protocols",
+             102: "Processing",
+             103: "Early Hints",
+             200: "OK",
+             201: "Created",
+             202: "Accepted",
+             203: "Non-Authoritative Information",
+             204: "No Content",
+             205: "Reset Content",
+             206: "Partial Content",
+             207: "Multi-Status",
+             208: "Already Reported",
+             226: "IM Used",
+             300: "Multiple Choices",
+             301: "Moved Permanently",
+             302: "Found",
+             303: "See Other",
+             304: "Not Modified",
+             305: "Use Proxy",
+             307: "Temporary Redirect",
+             308: "Permanent Redirect",
+             400: "Bad Request",
+             401: "Unauthorized",
+             402: "Payment Required",
+             403: "Forbidden",
+             404: "Not Found",
+             405: "Method Not Allowed",
+             406: "Not Acceptable",
+             407: "Proxy Authentication Required",
+             408: "Request Timeout",
+             409: "Conflict",
+             410: "Gone",
+             411: "Length Required",
+             412: "Precondition Failed",
+             413: "Payload Too Large",
+             414: "URI Too Long",
+             415: "Unsupported Media Type",
+             416: "Range Not Satisfiable",
+             417: "Expectation Failed",
+             418: "I'm a teapot",
+             421: "Misdirected Request",
+             422: "Unprocessable Entity",
+             423: "Locked",
+             424: "Failed Dependency",
+             425: "Too Early",
+             426: "Upgrade Required",
+             428: "Precondition Required",
+             429: "Too Many Requests",
+             431: "Request Header Fields Too Large",
+             451: "Unavailable For Legal Reasons",
+             500: "Internal Server Error",
+             501: "Not Implemented",
+             502: "Bad Gateway",
+             503: "Service Unavailable",
+             504: "Gateway Timeout",
+             505: "HTTP Version Not Supported",
+             506: "Variant Also Negotiates",
+             507: "Insufficient Storage",
+             508: "Loop Detected",
+             510: "Not Extended",
+             511: "Network Authentication Required"
+         }
+
+     def test_parsing_content_type(self):
+         """Test if parsing different types of content-type returns the expected result"""
+         for header_value, expected_encoding in self.content_type_map.items():
+             self.assertEqual(ResponseEncoding.get_value(header_value), expected_encoding)
+
+     def test_parsing_response_status(self):
+         """Test if using different http responses' status codes returns the expected result"""
+         for status_code, expected_status_text in self.status_map.items():
+             self.assertEqual(StatusText.get(status_code), expected_status_text)
+
+         self.assertEqual(StatusText.get(1000), "Unknown Status Code")
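`StatusText` is imported by these tests but its implementation is not part of this diff; judging from the assertions, it maps a status code to its reason phrase with an "Unknown Status Code" fallback. A rough stand-in using only the standard library (hypothetical helper; exact phrases differ slightly across Python versions, e.g. 413, so the real class presumably ships its own table):

```python
from http import HTTPStatus

def status_text(code: int) -> str:
    # Hypothetical approximation of scrapling's StatusText.get().
    try:
        return HTTPStatus(code).phrase
    except ValueError:
        return "Unknown Status Code"

print(status_text(200))   # OK
print(status_text(1000))  # Unknown Status Code
```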
{scrapling-0.2.4 → scrapling-0.2.6}/tests/parser/test_general.py

@@ -278,7 +278,7 @@ class TestParser(unittest.TestCase):
  self.assertEqual(len(elements), 5000)
  # Converting 5000 elements to a class and doing operations on them will take time
  # Based on my tests with 100 runs, 1 loop each Scrapling (given the extra work/features) takes 10.4ms on average
- self.assertLess(end_time - start_time, 0.1)
+ self.assertLess(end_time - start_time, 0.5)  # Locally I test on 0.1 but on GitHub actions with browsers and threading sometimes closing adds fractions of seconds


  # Use `coverage run -m unittest --verbose tests/test_parser_functions.py` instead for the coverage report