scrapling 0.2.6__py3-none-any.whl → 0.2.8__py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -1,10 +1,11 @@
  # Declare top-level shortcuts
- from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
+ from scrapling.core.custom_types import AttributesHandler, TextHandler
+ from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
+                                 StealthyFetcher)
  from scrapling.parser import Adaptor, Adaptors
- from scrapling.core.custom_types import TextHandler, AttributesHandler

  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.2.6"
+ __version__ = "0.2.8"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
scrapling/core/_types.py CHANGED
@@ -2,9 +2,8 @@
  Type definitions for type checking purposes.
  """

- from typing import (
-     Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
- )
+ from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
+                     List, Literal, Optional, Pattern, Tuple, Type, Union)

  try:
      from typing import Protocol
@@ -1,13 +1,13 @@
  import re
- from types import MappingProxyType
  from collections.abc import Mapping
+ from types import MappingProxyType

- from scrapling.core.utils import _is_iterable, flatten
- from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
-
- from orjson import loads, dumps
+ from orjson import dumps, loads
  from w3lib.html import replace_entities as _replace_entities

+ from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
+ from scrapling.core.utils import _is_iterable, flatten
+

  class TextHandler(str):
      """Extends standard Python string by adding more functionality"""
@@ -1,16 +1,16 @@
- import orjson
- import sqlite3
  import logging
+ import sqlite3
  import threading
- from hashlib import sha256
  from abc import ABC, abstractmethod
+ from hashlib import sha256

- from scrapling.core._types import Dict, Optional, Union
- from scrapling.core.utils import _StorageTools, cache
-
+ import orjson
  from lxml import html
  from tldextract import extract as tld

+ from scrapling.core._types import Dict, Optional, Union
+ from scrapling.core.utils import _StorageTools, cache
+

  class StorageSystemMixin(ABC):
      # If you want to make your own storage system, you have to inherit from this
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  import re

- from w3lib.html import HTML5_WHITESPACE
- from scrapling.core.utils import cache
- from scrapling.core._types import Any, Optional, Protocol, Self
-
- from cssselect.xpath import ExpressionError
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
+ from cssselect.xpath import ExpressionError
+ from cssselect.xpath import XPathExpr as OriginalXPathExpr
+ from w3lib.html import HTML5_WHITESPACE

+ from scrapling.core._types import Any, Optional, Protocol, Self
+ from scrapling.core.utils import cache

  regex = f"[{HTML5_WHITESPACE}]+"
  replace_html5_whitespaces = re.compile(regex).sub
scrapling/core/utils.py CHANGED
@@ -1,22 +1,25 @@
- import re
  import logging
+ import re
  from itertools import chain
- # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
- from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
-
- from scrapling.core._types import Dict, Iterable, Any, Union

  import orjson
  from lxml import html

+ from scrapling.core._types import Any, Dict, Iterable, Union
+
+ # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
+ # functools.cache is available on Python 3.9+ only so let's keep lru_cache
+ from functools import lru_cache as cache # isort:skip
+
+
  html_forbidden = {html.HtmlComment, }
  logging.basicConfig(
-     level=logging.ERROR,
-     format='%(asctime)s - %(levelname)s - %(message)s',
-     handlers=[
-         logging.StreamHandler()
-     ]
- )
+    level=logging.ERROR,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    handlers=[
+        logging.StreamHandler()
+    ]
+ )


  def is_jsonable(content: Union[bytes, str]) -> bool:
@@ -94,7 +97,7 @@ class _StorageTools:
          parent = element.getparent()
          return tuple(
              (element.tag,) if parent is None else (
-                 cls._get_element_path(parent) + (element.tag,)
+                     cls._get_element_path(parent) + (element.tag,)
              )
          )

scrapling/defaults.py CHANGED
@@ -1,4 +1,4 @@
- from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher

  # If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
  Fetcher = Fetcher()
@@ -1,7 +1,7 @@
  from .camo import CamoufoxEngine
- from .static import StaticEngine
- from .pw import PlaywrightEngine
  from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
+ from .pw import PlaywrightEngine
+ from .static import StaticEngine
  from .toolbelt import check_if_engine_usable

  __all__ = ['CamoufoxEngine', 'PlaywrightEngine']
scrapling/engines/camo.py CHANGED
@@ -1,19 +1,16 @@
  import logging
- from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
-
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     get_os_name,
-     intercept_route,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )

+ from camoufox import DefaultAddons
  from camoufox.sync_api import Camoufox

+ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         get_os_name, intercept_route)
+

  class CamoufoxEngine:
      def __init__(
@@ -21,7 +18,8 @@ class CamoufoxEngine:
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
          timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
          wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
-         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
+         proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
+         adaptor_arguments: Dict = None,
  ):
      """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.

@@ -36,6 +34,7 @@ class CamoufoxEngine:
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
      :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+     :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
      :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
@@ -54,6 +53,7 @@ class CamoufoxEngine:
      self.network_idle = bool(network_idle)
      self.google_search = bool(google_search)
      self.os_randomize = bool(os_randomize)
+     self.disable_ads = bool(disable_ads)
      self.extra_headers = extra_headers or {}
      self.proxy = construct_proxy_dict(proxy)
      self.addons = addons or []
@@ -75,9 +75,11 @@ class CamoufoxEngine:
      :param url: Target url.
      :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
      """
+     addons = [] if self.disable_ads else [DefaultAddons.UBO]
      with Camoufox(
          proxy=self.proxy,
          addons=self.addons,
+         exclude_addons=addons,
          headless=self.headless,
          humanize=self.humanize,
          i_know_what_im_doing=True, # To turn warnings off with the user configurations
@@ -105,6 +107,11 @@ class CamoufoxEngine:
      if self.wait_selector and type(self.wait_selector) is str:
          waiter = page.locator(self.wait_selector)
          waiter.first.wait_for(state=self.wait_selector_state)
+         # Wait again after waiting for the selector, helpful with protections like Cloudflare
+         page.wait_for_load_state(state="load")
+         page.wait_for_load_state(state="domcontentloaded")
+         if self.network_idle:
+             page.wait_for_load_state('networkidle')

      # This will be parsed inside `Response`
      encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
@@ -44,7 +44,7 @@ DEFAULT_STEALTH_FLAGS = [
      '--disable-default-apps',
      '--disable-print-preview',
      '--disable-dev-shm-usage',
-     '--disable-popup-blocking',
+     # '--disable-popup-blocking',
      '--metrics-recording-only',
      '--disable-crash-reporter',
      '--disable-partial-raster',
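A minimal usage sketch of the new `disable_ads` option through `StealthyFetcher` (the URL is a placeholder). With the default `disable_ads=True` the bundled uBlock Origin addon stays installed; passing `False` excludes it via Camoufox's `exclude_addons`:

```python
from scrapling import StealthyFetcher

# disable_ads defaults to True (uBlock Origin stays installed and blocks ads);
# pass False to have the engine exclude that addon from the Camoufox browser.
page = StealthyFetcher().fetch('https://example.com', disable_ads=False)
print(page.status)
```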
scrapling/engines/pw.py CHANGED
@@ -1,20 +1,15 @@
  import json
  import logging
- from scrapling.core._types import Union, Callable, Optional, List, Dict
-
- from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
- from scrapling.engines.toolbelt import (
-     Response,
-     do_nothing,
-     StatusText,
-     js_bypass_path,
-     intercept_route,
-     generate_headers,
-     construct_cdp_url,
-     check_type_validity,
-     construct_proxy_dict,
-     generate_convincing_referer,
- )
+
+ from scrapling.core._types import Callable, Dict, List, Optional, Union
+ from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
+                                          NSTBROWSER_DEFAULT_QUERY)
+ from scrapling.engines.toolbelt import (Response, StatusText,
+                                         check_type_validity, construct_cdp_url,
+                                         construct_proxy_dict, do_nothing,
+                                         generate_convincing_referer,
+                                         generate_headers, intercept_route,
+                                         js_bypass_path)


  class PlaywrightEngine:
@@ -26,6 +21,7 @@ class PlaywrightEngine:
          timeout: Optional[float] = 30000,
          page_action: Callable = do_nothing,
          wait_selector: Optional[str] = None,
+         locale: Optional[str] = 'en-US',
          wait_selector_state: Optional[str] = 'attached',
          stealth: Optional[bool] = False,
          real_chrome: Optional[bool] = False,
@@ -50,6 +46,7 @@ class PlaywrightEngine:
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
      :param wait_selector: Wait for a specific css selector to be in a specific state.
+     :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
      :param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
      :param real_chrome: If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it.
@@ -64,6 +61,7 @@ class PlaywrightEngine:
      :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
      """
      self.headless = headless
+     self.locale = check_type_validity(locale, [str], 'en-US', param_name='locale')
      self.disable_resources = disable_resources
      self.network_idle = bool(network_idle)
      self.stealth = bool(stealth)
@@ -87,6 +85,14 @@ class PlaywrightEngine:
      self.nstbrowser_mode = bool(nstbrowser_mode)
      self.nstbrowser_config = nstbrowser_config
      self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+     self.harmful_default_args = [
+         # This will be ignored to avoid detection more and possibly avoid the popup crashing bug abuse: https://issues.chromium.org/issues/340836884
+         '--enable-automation',
+         '--disable-popup-blocking',
+         # '--disable-component-update',
+         # '--disable-default-apps',
+         # '--disable-extensions',
+     ]

  def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
      """Constructs new CDP URL if NSTBrowser is enabled otherwise return CDP URL as it is
@@ -151,15 +157,15 @@ class PlaywrightEngine:
      else:
          if self.stealth:
              browser = p.chromium.launch(
-                 headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
+                 headless=self.headless, args=flags, ignore_default_args=self.harmful_default_args, chromium_sandbox=True, channel='chrome' if self.real_chrome else 'chromium'
              )
          else:
-             browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'], channel='chrome' if self.real_chrome else 'chromium')
+             browser = p.chromium.launch(headless=self.headless, ignore_default_args=self.harmful_default_args, channel='chrome' if self.real_chrome else 'chromium')

      # Creating the context
      if self.stealth:
          context = browser.new_context(
-             locale='en-US',
+             locale=self.locale,
              is_mobile=False,
              has_touch=False,
              proxy=self.proxy,
@@ -176,6 +182,8 @@ class PlaywrightEngine:
          )
      else:
          context = browser.new_context(
+             locale=self.locale,
+             proxy=self.proxy,
              color_scheme='dark',
              user_agent=useragent,
              device_scale_factor=2,
@@ -221,6 +229,11 @@ class PlaywrightEngine:
      if self.wait_selector and type(self.wait_selector) is str:
          waiter = page.locator(self.wait_selector)
          waiter.first.wait_for(state=self.wait_selector_state)
+         # Wait again after waiting for the selector, helpful with protections like Cloudflare
+         page.wait_for_load_state(state="load")
+         page.wait_for_load_state(state="domcontentloaded")
+         if self.network_idle:
+             page.wait_for_load_state('networkidle')

      # This will be parsed inside `Response`
      encoding = res.headers.get('content-type', '') or 'utf-8' # default encoding
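A minimal usage sketch of the new `locale` argument through `PlayWrightFetcher` (the URL is a placeholder); the value is type-checked and forwarded to the Playwright browser context:

```python
from scrapling import PlayWrightFetcher

# locale defaults to 'en-US' and ends up in browser.new_context(locale=...)
page = PlayWrightFetcher().fetch('https://example.com', locale='de-DE')
print(page.status)
```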
@@ -1,11 +1,12 @@
  import logging

- from scrapling.core._types import Union, Optional, Dict
- from .toolbelt import Response, generate_convincing_referer, generate_headers
-
  import httpx
  from httpx._models import Response as httpxResponse

+ from scrapling.core._types import Dict, Optional, Union
+
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+

  class StaticEngine:
      def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
@@ -63,54 +64,66 @@ class StaticEngine:
          **self.adaptor_arguments
      )

-     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)

-     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def put(self, url: str, proxy: Optional[str] = None, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.

          :param url: Target url.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.

          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
          headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
-         request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         with httpx.Client(proxy=proxy) as client:
+             request = client.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+
          return self._prepare_response(request)
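A brief, hedged usage sketch of the new per-request `proxy` argument as exposed on the top-level `Fetcher` (URL and credentials are placeholders); each call now routes through a dedicated `httpx.Client(proxy=...)`:

```python
from scrapling import Fetcher

# The proxy string covers both HTTP and HTTPS traffic for this single request.
page = Fetcher().get('https://httpbin.org/ip', proxy='http://username:password@localhost:8030')
print(page.status)
```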
@@ -1,20 +1,6 @@
- from .fingerprints import (
-     get_os_name,
-     generate_headers,
-     generate_convincing_referer,
- )
- from .custom import (
-     Response,
-     do_nothing,
-     StatusText,
-     BaseFetcher,
-     get_variable_name,
-     check_type_validity,
-     check_if_engine_usable,
- )
- from .navigation import (
-     js_bypass_path,
-     intercept_route,
-     construct_cdp_url,
-     construct_proxy_dict,
- )
+ from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
+                      check_type_validity, do_nothing, get_variable_name)
+ from .fingerprints import (generate_convincing_referer, generate_headers,
+                            get_os_name)
+ from .navigation import (construct_cdp_url, construct_proxy_dict,
+                          intercept_route, js_bypass_path)
@@ -5,10 +5,11 @@ import inspect
  import logging
  from email.message import Message

+ from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
+                                    Type, Union)
  from scrapling.core.custom_types import MappingProxyType
+ from scrapling.core.utils import cache, setup_basic_logging
  from scrapling.parser import Adaptor, SQLiteStorageSystem
- from scrapling.core.utils import setup_basic_logging, cache
- from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple


  class ResponseEncoding:
@@ -39,7 +40,7 @@ class ResponseEncoding:

      @classmethod
      @cache(maxsize=None)
-     def get_value(cls, content_type: Optional[str]) -> str:
+     def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') -> str:
          """Determine the appropriate character encoding from a content-type header.

          The encoding is determined by these rules in order:
@@ -50,26 +51,30 @@ class ResponseEncoding:
          5. Default to UTF-8 if nothing else matches

          :param content_type: Content-Type header value or None
+         :param text: A text to test the encoding on it
          :return: String naming the character encoding
          """
          if not content_type:
              return cls.__DEFAULT_ENCODING

          try:
+             encoding = None
              content_type, params = cls.__parse_content_type(content_type)

              # First check for explicit charset parameter
              if "charset" in params:
                  encoding = params["charset"].strip("'\"")
-                 "test".encode(encoding) # Validate encoding
-                 return encoding

              # Apply content-type specific rules
-             if content_type in cls.__ISO_8859_1_CONTENT_TYPES:
-                 return "ISO-8859-1"
+             elif content_type in cls.__ISO_8859_1_CONTENT_TYPES:
+                 encoding = "ISO-8859-1"
+
+             elif content_type == "application/json":
+                 encoding = cls.__DEFAULT_ENCODING

-             if content_type == "application/json":
-                 return cls.__DEFAULT_ENCODING
+             if encoding:
+                 _ = text.encode(encoding) # Validate encoding and validate it can encode the given text
+                 return encoding

              return cls.__DEFAULT_ENCODING

@@ -87,7 +92,7 @@ class Response(Adaptor):
          self.cookies = cookies
          self.headers = headers
          self.request_headers = request_headers
-         encoding = ResponseEncoding.get_value(encoding)
+         encoding = ResponseEncoding.get_value(encoding, text)
          super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
          # For back-ward compatibility
          self.adaptor = self
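A short, hedged illustration of the reworked resolution order in `ResponseEncoding.get_value` (the import path is inferred from this diff; exact defaults depend on the class internals): an explicit charset wins when it can encode the supplied text, otherwise the content-type rules and finally the UTF-8 default apply.

```python
from scrapling.engines.toolbelt.custom import ResponseEncoding

# Explicit charset parameter wins, provided it can encode the sample text.
print(ResponseEncoding.get_value('text/html; charset=windows-1252', text='café'))
# JSON bodies resolve to the class default encoding (UTF-8).
print(ResponseEncoding.get_value('application/json'))
```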
@@ -4,12 +4,12 @@ Functions related to generating headers and fingerprints generally

  import platform

- from scrapling.core.utils import cache
- from scrapling.core._types import Union, Dict
-
+ from browserforge.fingerprints import Fingerprint, FingerprintGenerator
+ from browserforge.headers import Browser, HeaderGenerator
  from tldextract import extract
- from browserforge.headers import HeaderGenerator, Browser
- from browserforge.fingerprints import FingerprintGenerator, Fingerprint
+
+ from scrapling.core._types import Dict, Union
+ from scrapling.core.utils import cache


  @cache(None, typed=True)
@@ -2,16 +2,16 @@
  Functions related to files and URLs
  """

- import os
  import logging
- from urllib.parse import urlparse, urlencode
+ import os
+ from urllib.parse import urlencode, urlparse
+
+ from playwright.sync_api import Route

+ from scrapling.core._types import Dict, Optional, Union
  from scrapling.core.utils import cache
- from scrapling.core._types import Union, Dict, Optional
  from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

- from playwright.sync_api import Route
-

  def intercept_route(route: Route) -> Union[Route, None]:
      """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
@@ -43,7 +43,7 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
          }
      except ValueError:
          # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
-         raise TypeError(f'The proxy argument\'s string is in invalid format!')
+         raise TypeError('The proxy argument\'s string is in invalid format!')

      elif isinstance(proxy_string, dict):
          valid_keys = ('server', 'username', 'password', )
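A short sketch of the two proxy formats `construct_proxy_dict` accepts when passed through the browser-based fetchers (all values are placeholders):

```python
from scrapling import StealthyFetcher

# Either a single proxy URL string...
page = StealthyFetcher().fetch('https://example.com', proxy='http://username:password@localhost:8030')

# ...or a dict limited to the keys construct_proxy_dict validates.
page = StealthyFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://localhost:8030', 'username': 'username', 'password': 'password'},
)
```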
scrapling/fetchers.py CHANGED
@@ -1,7 +1,8 @@
- from scrapling.core._types import Dict, Optional, Union, Callable, List, Literal
-
- from scrapling.engines.toolbelt import Response, BaseFetcher, do_nothing
- from scrapling.engines import CamoufoxEngine, PlaywrightEngine, StaticEngine, check_if_engine_usable
+ from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
+                                    Union)
+ from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
+                                check_if_engine_usable)
+ from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing


  class Fetcher(BaseFetcher):
@@ -9,7 +10,7 @@ class Fetcher(BaseFetcher):

      Any additional keyword arguments passed to the methods below are passed to the respective httpx's method directly.
      """
-     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def get(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP GET request for you but with some added flavors.

          :param url: Target url.
@@ -17,13 +18,14 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request had came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def post(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP POST request for you but with some added flavors.

          :param url: Target url.
@@ -31,13 +33,14 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def put(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP PUT request for you but with some added flavors.

          :param url: Target url
@@ -45,14 +48,15 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.

          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, proxy, stealthy_headers, **kwargs)
          return response_object

-     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+     def delete(self, url: str, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = 10, stealthy_headers: Optional[bool] = True, proxy: Optional[str] = None, **kwargs: Dict) -> Response:
          """Make basic HTTP DELETE request for you but with some added flavors.

          :param url: Target url
@@ -60,10 +64,11 @@ class Fetcher(BaseFetcher):
          :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
          :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
              create a referer header as if this request came from Google's search of this URL's domain.
+         :param proxy: A string of a proxy to use for http and https requests, the format accepted is `http://username:password@localhost:8030`
          :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
          :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
          """
-         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
+         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, proxy, stealthy_headers, **kwargs)
          return response_object

@@ -78,7 +83,7 @@ class StealthyFetcher(BaseFetcher):
          block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
          timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
          wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
-         os_randomize: Optional[bool] = None
+         os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
  ) -> Response:
      """
      Opens up a browser and do your request based on your chosen options below.
@@ -92,6 +97,7 @@ class StealthyFetcher(BaseFetcher):
          This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
      :param block_webrtc: Blocks WebRTC entirely.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+     :param disable_ads: Enabled by default, this installs `uBlock Origin` addon on the browser if enabled.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
      :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
@@ -111,6 +117,7 @@ class StealthyFetcher(BaseFetcher):
          timeout=timeout,
          headless=headless,
          humanize=humanize,
+         disable_ads=disable_ads,
          allow_webgl=allow_webgl,
          page_action=page_action,
          network_idle=network_idle,
@@ -148,7 +155,7 @@ class PlayWrightFetcher(BaseFetcher):
          useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
          page_action: Optional[Callable] = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
          hide_canvas: Optional[bool] = False, disable_webgl: Optional[bool] = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
-         proxy: Optional[Union[str, Dict[str, str]]] = None,
+         proxy: Optional[Union[str, Dict[str, str]]] = None, locale: Optional[str] = 'en-US',
          stealth: Optional[bool] = False, real_chrome: Optional[bool] = False,
          cdp_url: Optional[str] = None,
          nstbrowser_mode: Optional[bool] = False, nstbrowser_config: Optional[Dict] = None,
@@ -163,6 +170,7 @@ class PlayWrightFetcher(BaseFetcher):
      :param useragent: Pass a useragent string to be used. Otherwise the fetcher will generate a real Useragent of the same browser and use it.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+     :param locale: Set the locale for the browser if wanted. The default value is `en-US`.
      :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
      :param wait_selector: Wait for a specific css selector to be in a specific state.
      :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
@@ -180,6 +188,7 @@ class PlayWrightFetcher(BaseFetcher):
      """
      engine = PlaywrightEngine(
          proxy=proxy,
+         locale=locale,
          timeout=timeout,
          stealth=stealth,
          cdp_url=cdp_url,
scrapling/parser.py CHANGED
@@ -1,16 +1,23 @@
+ import inspect
  import os
  import re
- import inspect
  from difflib import SequenceMatcher

- from scrapling.core.translator import HTMLTranslator
- from scrapling.core.mixins import SelectorsGeneration
- from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
- from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
- from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
- from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+ from cssselect import SelectorError, SelectorSyntaxError
+ from cssselect import parse as split_selectors
  from lxml import etree, html
- from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
+
+ from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                    List, Optional, Pattern, SupportsIndex,
+                                    Tuple, Union)
+ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                          TextHandlers)
+ from scrapling.core.mixins import SelectorsGeneration
+ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                              StorageSystemMixin, _StorageTools)
+ from scrapling.core.translator import HTMLTranslator
+ from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                   is_jsonable, logging, setup_basic_logging)


  class Adaptor(SelectorsGeneration):
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.6
- Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+ Version: 0.2.8
+ Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
  Author-email: karim.shoair@pm.me
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
  Requires-Dist: playwright==1.48
  Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.3.10
+ Requires-Dist: camoufox>=0.4.4
  Requires-Dist: browserforge

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -90,10 +90,11 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
    * [Text Extraction Speed Test (5000 nested elements).](#text-extraction-speed-test-5000-nested-elements)
    * [Extraction By Text Speed Test](#extraction-by-text-speed-test)
  * [Installation](#installation)
- * [Fetching Websites Features](#fetching-websites-features)
-   * [Fetcher](#fetcher)
-   * [StealthyFetcher](#stealthyfetcher)
-   * [PlayWrightFetcher](#playwrightfetcher)
+ * [Fetching Websites](#fetching-websites)
+   * [Features](#features)
+   * [Fetcher class](#fetcher)
+   * [StealthyFetcher class](#stealthyfetcher)
+   * [PlayWrightFetcher class](#playwrightfetcher)
  * [Advanced Parsing Features](#advanced-parsing-features)
    * [Smart Navigation](#smart-navigation)
    * [Content-based Selection & Finding Similar Elements](#content-based-selection--finding-similar-elements)
@@ -256,43 +257,48 @@ playwright install chromium
  python -m browserforge update
  ```

- ## Fetching Websites Features
- You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
+ ## Fetching Websites
+ Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+
+ ### Features
+ You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
  page = StealthyFetcher.fetch('https://example.com')
  ```

- Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

  For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+
+ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
  >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
- >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'})
+ >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
  ### StealthyFetcher
- This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

@@ -309,6 +315,7 @@ True
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
  | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
  | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
@@ -327,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -339,7 +346,7 @@ Using this Fetcher class, you can make requests with:
  3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

- > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+ > Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -362,7 +369,8 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
- | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
  | nstbrowser_config | The config you want to send with requests to the NSTBrowser. _If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config._ | ✔️ |
@@ -814,8 +822,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
  Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.

  ## More Sponsors!
- [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
- <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+ <a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" alt="SerpApi Banner" ></a>


  ## Contributing
@@ -0,0 +1,42 @@
+ scrapling/__init__.py,sha256=0-gw4uqckCs7ikl6sHiB5c6y0AelpgefqJkBmSd7j1k,469
+ scrapling/defaults.py,sha256=qO6zAS7k5_QXvbjuoBv87fUMqASGMuM2dVry9J9auv0,287
+ scrapling/fetchers.py,sha256=iw1wEuFg14akJYpSg9webfBjAL341Pnofn4IkWahGlE,17486
+ scrapling/parser.py,sha256=suXggr39GimLnnLm9ivM1CQ40AoDwGke2sgnWszqFqk,54331
+ scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+ scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+ scrapling/core/custom_types.py,sha256=8GCgcZL-IT5lP6titxL-RPCiItQSuJZjSlFIGCDxoSs,8402
+ scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+ scrapling/core/storage_adaptors.py,sha256=Q2-G7oDqoIqlIBEmnUsKwSzM2lNGNUPKtTbMjTV9178,6218
+ scrapling/core/translator.py,sha256=WN_xPyYrD1MjLPv8Ar8zHNTPC_iYsW29kkjET4hbFI0,5228
+ scrapling/core/utils.py,sha256=RajDRSPkVmszjpwNy8NIz8ZlUxPox8j2rSractr7Q9s,3779
+ scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+ scrapling/engines/camo.py,sha256=fmpGMW5T7we5cQC8muyvVo_A27yAqc5csm7dO_2jHiE,8446
+ scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
+ scrapling/engines/pw.py,sha256=kWbkHm2vnQYeGuJnicKlAL1HrBKuXoFtyRMNFXLs4VY,13962
+ scrapling/engines/static.py,sha256=h629IjT78YbhjFYBVSli53lKiYrG3929TAaZ7TA-j-Y,8022
+ scrapling/engines/toolbelt/__init__.py,sha256=0tSsxMH5ALOMPXrLkr8mTH7LWg9QfIse4Ij9vUFgYjY,391
+ scrapling/engines/toolbelt/custom.py,sha256=tab_wJmN6onvu2U8tDXeJ9jn6A47jTkmxSBoc-w8dIk,12789
+ scrapling/engines/toolbelt/fingerprints.py,sha256=Y3FW8uqxxeNK3v6vBVvki8VjeG5oRxSwim4Q2Hv_cRk,2917
+ scrapling/engines/toolbelt/navigation.py,sha256=Okpl4ynlLn2cUpSiaaoXDSOdDOXhvxNOOGphE_HXc5k,4016
+ scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+ scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+ scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+ scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+ scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+ scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+ tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+ tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+ tests/fetchers/test_camoufox.py,sha256=-1v_0mXeBcAVW932nkFws1HIDCodGbpNYniSnVMHeeU,3116
+ tests/fetchers/test_httpx.py,sha256=rrw9q4KdDAHpQVa4sTmw278Yv1OlwY_SKPbpBPLVN7c,3508
+ tests/fetchers/test_playwright.py,sha256=xwhRmlw7WBrtqyilZsoMHkHpyAx7iXQ-YexDMJURTao,3702
+ tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
+ tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
+ tests/parser/test_general.py,sha256=sPbwQRka9Mh8MDz2Sto8Rwg78t0SWWxELgzhTVPEplE,11785
+ scrapling-0.2.8.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.8.dist-info/METADATA,sha256=0As--zWykpljObaw8DZQJr6udpHm4NyRN-dfUOUrhBc,66605
+ scrapling-0.2.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.8.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.8.dist-info/RECORD,,
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import StealthyFetcher
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import Fetcher
@@ -1,4 +1,5 @@
  import unittest
+
  import pytest_httpbin

  from scrapling import PlayWrightFetcher
@@ -1,9 +1,11 @@

  import pickle
  import unittest
- from scrapling import Adaptor
+
  from cssselect import SelectorError, SelectorSyntaxError

+ from scrapling import Adaptor
+

  class TestParser(unittest.TestCase):
      def setUp(self):
@@ -1,42 +0,0 @@
- scrapling/__init__.py,sha256=NnIpEZcBGs5Pu2TjqPCacC7N6LN37SbnniBU1AhgdXs,435
- scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
- scrapling/fetchers.py,sha256=-gc-Yo1MjF_4cdJ-5rxZqNC0owxFXTFoEBj08BFEYPs,16361
- scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
- scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
- scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
- scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
- scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
- scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
- scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
- scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
- scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
- scrapling/engines/camo.py,sha256=dXkdfFmf3M09RXAvaZ8CE5khsblC3Wd7_6jWfu8XO6I,7618
- scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
- scrapling/engines/pw.py,sha256=gMWJAZYpJbFK-GiyRrpVrMjyMqSSetE6hf8kmf0zR2o,12729
- scrapling/engines/static.py,sha256=wzBsoOHPpN5JV1izQSSSarPBNWB-wo0BDWNFuin6ID8,7109
- scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
- scrapling/engines/toolbelt/custom.py,sha256=ELr3_FwUqNI27E98kz-50OA5a6hQQtoIYrZoLKsvUpM,12551
- scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
- scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
- scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
- scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
- scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
- scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
- scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
- tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
- tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
- tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
- tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
- tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
- tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
- tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
- tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
- tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
- scrapling-0.2.6.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
- scrapling-0.2.6.dist-info/METADATA,sha256=cFOu2nlkXDsjyjkIt9kDu1nKKvS14xYH2LT4_VNH5j0,65362
- scrapling-0.2.6.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
- scrapling-0.2.6.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
- scrapling-0.2.6.dist-info/RECORD,,