scrapling-0.2-py3-none-any.whl → scrapling-0.2.1-py3-none-any.whl

scrapling/__init__.py CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2"
+__version__ = "0.2.1"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/core/utils.py CHANGED
@@ -4,8 +4,9 @@ from itertools import chain
 # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
 from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache
 
-from scrapling.core._types import Dict, Iterable, Any
+from scrapling.core._types import Dict, Iterable, Any, Union
 
+import orjson
 from lxml import html
 
 html_forbidden = {html.HtmlComment, }
@@ -18,6 +19,17 @@ logging.basicConfig(
 )
 
 
+def is_jsonable(content: Union[bytes, str]) -> bool:
+    if type(content) is bytes:
+        content = content.decode()
+
+    try:
+        _ = orjson.loads(content)
+        return True
+    except orjson.JSONDecodeError:
+        return False
+
+
 @cache(None, typed=True)
 def setup_basic_logging(level: str = 'debug'):
     levels = {
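
The new `is_jsonable` helper simply reports whether a `str`/`bytes` payload parses as JSON via `orjson`. A minimal usage sketch (the sample payloads are illustrative only):

```python
from scrapling.core.utils import is_jsonable

# Valid JSON, whether str or bytes, returns True; anything else returns False
is_jsonable('{"status": "ok"}')   # True
is_jsonable(b'[1, 2, 3]')         # True
is_jsonable('<html></html>')      # False
```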
scrapling/engines/camo.py CHANGED
@@ -7,6 +7,7 @@ from scrapling.engines.toolbelt import (
     get_os_name,
     intercept_route,
     check_type_validity,
+    construct_proxy_dict,
     generate_convincing_referer,
 )
 
@@ -18,7 +19,8 @@ class CamoufoxEngine:
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, adaptor_arguments: Dict = None
+            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
     ):
         """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
 
@@ -33,12 +35,14 @@ class CamoufoxEngine:
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
         self.headless = headless
@@ -48,7 +52,9 @@ class CamoufoxEngine:
         self.allow_webgl = bool(allow_webgl)
         self.network_idle = bool(network_idle)
         self.google_search = bool(google_search)
+        self.os_randomize = bool(os_randomize)
         self.extra_headers = extra_headers or {}
+        self.proxy = construct_proxy_dict(proxy)
         self.addons = addons or []
         self.humanize = humanize
         self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -66,17 +72,18 @@ class CamoufoxEngine:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         with Camoufox(
-            headless=self.headless,
-            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
-            os=get_os_name(),
-            block_webrtc=self.block_webrtc,
-            allow_webgl=self.allow_webgl,
+            proxy=self.proxy,
             addons=self.addons,
+            headless=self.headless,
             humanize=self.humanize,
-            i_know_what_im_doing=True,  # To turn warnings off with user configurations
+            i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+            allow_webgl=self.allow_webgl,
+            block_webrtc=self.block_webrtc,
+            block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+            os=None if self.os_randomize else get_os_name(),
         ) as browser:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
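
The two new CamoufoxEngine options above are exposed through `StealthyFetcher.fetch()` (see scrapling/fetchers.py below). A sketch of how they might be passed; the URL and proxy address are placeholders:

```python
from scrapling import StealthyFetcher

page = StealthyFetcher().fetch(
    'https://example.com',
    # Either a proxy string or a dict with only 'server', 'username', and 'password' keys
    proxy='http://user:pass@127.0.0.1:8080',
    os_randomize=True,  # randomize the OS fingerprints instead of matching the host OS
)
print(page.status)  # Response now behaves like an Adaptor with extra HTTP attributes
```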
scrapling/engines/pw.py CHANGED
@@ -9,8 +9,9 @@ from scrapling.engines.toolbelt import (
     js_bypass_path,
     intercept_route,
     generate_headers,
-    check_type_validity,
     construct_cdp_url,
+    check_type_validity,
+    construct_proxy_dict,
     generate_convincing_referer,
 )
 
@@ -33,6 +34,7 @@ class PlaywrightEngine:
             nstbrowser_config: Optional[Dict] = None,
             google_search: Optional[bool] = True,
             extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None,
             adaptor_arguments: Dict = None
     ):
         """An engine that utilizes PlayWright library, check the `PlayWrightFetcher` class for more documentation.
@@ -54,6 +56,7 @@ class PlaywrightEngine:
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
         """
@@ -65,6 +68,7 @@ class PlaywrightEngine:
         self.disable_webgl = bool(disable_webgl)
         self.google_search = bool(google_search)
         self.extra_headers = extra_headers or {}
+        self.proxy = construct_proxy_dict(proxy)
         self.cdp_url = cdp_url
         self.useragent = useragent
         self.timeout = check_type_validity(timeout, [int, float], 30000)
@@ -112,7 +116,7 @@ class PlaywrightEngine:
         """Opens up the browser and do your request based on your chosen options.
 
         :param url: Target url.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         if not self.stealth:
             from playwright.sync_api import sync_playwright
@@ -151,6 +155,7 @@ class PlaywrightEngine:
             locale='en-US',
             is_mobile=False,
             has_touch=False,
+            proxy=self.proxy,
             color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
             user_agent=useragent,
             device_scale_factor=2,
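
PlaywrightEngine gains the same `proxy` option and forwards it to the browser context. A small sketch through `PlayWrightFetcher`, here using the dictionary form accepted by `construct_proxy_dict` (all values are placeholders):

```python
from scrapling import PlayWrightFetcher

page = PlayWrightFetcher().fetch(
    'https://example.com',
    proxy={'server': 'http://127.0.0.1:8080', 'username': 'user', 'password': 'pass'},
    stealth=True,
)
print(page.status)
```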
scrapling/engines/static.py CHANGED
@@ -48,7 +48,7 @@ class StaticEngine:
         """Takes httpx response and generates `Response` object from it.
 
         :param response: httpx response object
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         return Response(
             url=str(response.url),
@@ -69,9 +69,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -81,9 +81,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -93,9 +93,9 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
-        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
 
@@ -105,8 +105,8 @@ class StaticEngine:
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
-        headers = self._headers_job(kwargs.get('headers'), url, stealthy_headers)
+        headers = self._headers_job(kwargs.pop('headers', {}), url, stealthy_headers)
         request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
         return self._prepare_response(request)
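
The switch from `kwargs.get('headers')` to `kwargs.pop('headers', {})` matters because the remaining `kwargs` are forwarded to httpx: popping removes `headers` from `kwargs` so it is not passed twice (once merged into the generated headers, once again through `**kwargs`), which would raise a duplicate keyword argument error. A rough illustration of the difference, using a simplified stand-in for `_headers_job`:

```python
import httpx

def _headers_job(headers):
    # Stand-in: merge user headers into generated browser-like headers
    return {'User-Agent': 'Mozilla/5.0', **(headers or {})}

kwargs = {'headers': {'X-Token': 'abc'}, 'params': {'q': 'scrapling'}}

# Old behaviour: 'headers' stays inside kwargs and is passed twice -> TypeError
# headers = _headers_job(kwargs.get('headers'))
# httpx.get('https://example.com', headers=headers, **kwargs)

# New behaviour: 'headers' is removed from kwargs before forwarding
headers = _headers_job(kwargs.pop('headers', {}))
response = httpx.get('https://example.com', headers=headers, **kwargs)
```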
scrapling/engines/toolbelt/__init__.py CHANGED
@@ -15,4 +15,5 @@ from .navigation import (
     js_bypass_path,
     intercept_route,
     construct_cdp_url,
+    construct_proxy_dict,
 )
scrapling/engines/toolbelt/custom.py CHANGED
@@ -3,43 +3,29 @@ Functions related to custom types or type checking
 """
 import inspect
 import logging
-from dataclasses import dataclass, field
 
 from scrapling.core.utils import setup_basic_logging
 from scrapling.parser import Adaptor, SQLiteStorageSystem
 from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 
 
-@dataclass(frozen=True)
-class Response:
+class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
-    url: str
-    text: str
-    content: bytes
-    status: int
-    reason: str
-    encoding: str = 'utf-8'  # default encoding
-    cookies: Dict = field(default_factory=dict)
-    headers: Dict = field(default_factory=dict)
-    request_headers: Dict = field(default_factory=dict)
-    adaptor_arguments: Dict = field(default_factory=dict)
-
-    @property
-    def adaptor(self) -> Union[Adaptor, None]:
-        """Generate Adaptor instance from this response if possible, otherwise return None"""
-        automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
-        if self.text:
-            # For playwright that will be the response after all JS executed
-            return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
-        elif self.content:
-            # For playwright, that's after all JS is loaded but not all of them executed, because playwright doesn't offer something like page.content()
-            # To get response Bytes after the load states
-            # Reference: https://playwright.dev/python/docs/api/class-page
-            return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
-        return None
-
-    def __repr__(self):
-        return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
+
+    def __init__(self, url: str, text: str, content: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, adaptor_arguments: Dict, encoding: str = 'utf-8'):
+        automatch_domain = adaptor_arguments.pop('automatch_domain', None)
+        super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
+
+        self.status = status
+        self.reason = reason
+        self.cookies = cookies
+        self.headers = headers
+        self.request_headers = request_headers
+        # For back-ward compatibility
+        self.adaptor = self
+
+    # def __repr__(self):
+    #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
 
 
 class BaseFetcher:
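
With `Response` now subclassing `Adaptor`, selection methods are available directly on the returned object, and `response.adaptor` is kept only as a backward-compatibility alias pointing at the same object. A small sketch (URL and selector are placeholders):

```python
from scrapling import Fetcher

page = Fetcher().get('https://quotes.toscrape.com/')
print(page.status, page.reason)           # HTTP attributes added on top of Adaptor
quotes = page.css('.quote .text::text')   # Adaptor methods work directly on the response
assert page.adaptor is page               # the old `.adaptor` access still works
```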
scrapling/engines/toolbelt/navigation.py CHANGED
@@ -25,6 +25,40 @@ def intercept_route(route: Route) -> Union[Route, None]:
     return route.continue_()
 
 
+def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
+    """Validate a proxy and return it in the acceptable format for Playwright
+    Reference: https://playwright.dev/python/docs/network#http-proxy
+
+    :param proxy_string: A string or a dictionary representation of the proxy.
+    :return:
+    """
+    if proxy_string:
+        if isinstance(proxy_string, str):
+            proxy = urlparse(proxy_string)
+            try:
+                return {
+                    'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
+                    'username': proxy.username or '',
+                    'password': proxy.password or '',
+                }
+            except ValueError:
+                # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+                raise TypeError(f'The proxy argument\'s string is in invalid format!')
+
+        elif isinstance(proxy_string, dict):
+            valid_keys = ('server', 'username', 'password', )
+            if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
+                return proxy_string
+            else:
+                raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')
+
+        else:
+            raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')
+
+    # The default value for proxy in Playwright's source is `None`
+    return None
+
+
 def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
     """Takes a CDP URL, reconstruct it to check it's valid, then adds encoded parameters if exists
 
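For reference, `construct_proxy_dict` normalizes both accepted proxy forms into the mapping Playwright expects and raises `TypeError` for anything else. A quick sketch of its behaviour based on the code above (addresses are placeholders):

```python
from scrapling.engines.toolbelt import construct_proxy_dict

# A proxy URL string is split into Playwright's server/username/password mapping
construct_proxy_dict('http://user:pass@127.0.0.1:8080')
# -> {'server': 'http://127.0.0.1:8080', 'username': 'user', 'password': 'pass'}

# A dict is passed through untouched, but only these keys are allowed
construct_proxy_dict({'server': 'http://127.0.0.1:8080', 'username': '', 'password': ''})

# Anything else (wrong type, unknown keys, malformed string) raises TypeError
```
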
scrapling/fetchers.py CHANGED
@@ -17,7 +17,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request had came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.get()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).get(url, stealthy_headers, **kwargs)
         return response_object
@@ -30,7 +30,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.post()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).post(url, stealthy_headers, **kwargs)
         return response_object
@@ -43,7 +43,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.put()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).put(url, stealthy_headers, **kwargs)
         return response_object
@@ -56,7 +56,7 @@ class Fetcher(BaseFetcher):
         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser's headers and
             create a referer header as if this request came from Google's search of this URL's domain.
         :param kwargs: Any additional keyword arguments are passed directly to `httpx.delete()` function so check httpx documentation for details.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         response_object = StaticEngine(follow_redirects, timeout, adaptor_arguments=self.adaptor_arguments).delete(url, stealthy_headers, **kwargs)
         return response_object
@@ -72,7 +72,8 @@ class StealthyFetcher(BaseFetcher):
             self, url: str, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, addons: Optional[List[str]] = None,
             timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, humanize: Optional[Union[bool, float]] = True,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None
+            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None, proxy: Optional[Union[str, Dict[str, str]]] = None,
+            os_randomize: Optional[bool] = None
     ) -> Response:
         """
         Opens up a browser and do your request based on your chosen options below.
@@ -88,29 +89,33 @@ class StealthyFetcher(BaseFetcher):
         :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
         :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
         :param wait_selector: Wait for a specific css selector to be in a specific state.
         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         engine = CamoufoxEngine(
+            proxy=proxy,
+            addons=addons,
             timeout=timeout,
             headless=headless,
-            page_action=page_action,
-            block_images=block_images,
-            block_webrtc=block_webrtc,
-            addons=addons,
             humanize=humanize,
             allow_webgl=allow_webgl,
-            disable_resources=disable_resources,
+            page_action=page_action,
             network_idle=network_idle,
+            block_images=block_images,
+            block_webrtc=block_webrtc,
+            os_randomize=os_randomize,
             wait_selector=wait_selector,
-            wait_selector_state=wait_selector_state,
             google_search=google_search,
             extra_headers=extra_headers,
+            disable_resources=disable_resources,
+            wait_selector_state=wait_selector_state,
             adaptor_arguments=self.adaptor_arguments,
         )
         return engine.fetch(url)
@@ -136,6 +141,7 @@ class PlayWrightFetcher(BaseFetcher):
             useragent: Optional[str] = None, network_idle: Optional[bool] = False, timeout: Optional[float] = 30000,
             page_action: Callable = do_nothing, wait_selector: Optional[str] = None, wait_selector_state: Optional[str] = 'attached',
             hide_canvas: bool = True, disable_webgl: bool = False, extra_headers: Optional[Dict[str, str]] = None, google_search: Optional[bool] = True,
+            proxy: Optional[Union[str, Dict[str, str]]] = None,
             stealth: bool = False,
             cdp_url: Optional[str] = None,
             nstbrowser_mode: bool = False, nstbrowser_config: Optional[Dict] = None,
@@ -157,12 +163,14 @@ class PlayWrightFetcher(BaseFetcher):
         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
         :param nstbrowser_mode: Enables NSTBrowser mode, it have to be used with `cdp_url` argument or it will get completely ignored.
         :param nstbrowser_config: The config you want to send with requests to the NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser's docker browserless config.
-        :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         engine = PlaywrightEngine(
+            proxy=proxy,
             timeout=timeout,
             stealth=stealth,
             cdp_url=cdp_url,
scrapling/parser.py CHANGED
@@ -7,7 +7,7 @@ from scrapling.core.translator import HTMLTranslator
 from scrapling.core.mixins import SelectorsGeneration
 from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
 from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
+from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
 from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
 from lxml import etree, html
 from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
@@ -60,6 +60,7 @@ class Adaptor(SelectorsGeneration):
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
+        self.__text = None
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
@@ -72,12 +73,14 @@ class Adaptor(SelectorsGeneration):
 
             body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
 
+            # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                # https://lxml.de/api/lxml.etree.HTMLParser-class.html
                 recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
                 compact=True, huge_tree=huge_tree, default_doctype=True
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
+            if is_jsonable(text or body.decode()):
+                self.__text = TextHandler(text or body.decode())
 
         else:
             # All html types inherits from HtmlMixin so this to check for all at once
@@ -112,7 +115,6 @@ class Adaptor(SelectorsGeneration):
         self.url = url
         # For selector stuff
         self.__attributes = None
-        self.__text = None
         self.__tag = None
         self.__debug = debug
 
@@ -185,23 +187,9 @@ class Adaptor(SelectorsGeneration):
     def text(self) -> TextHandler:
         """Get text content of the element"""
         if not self.__text:
-            if self.__keep_comments:
-                if not self.children:
-                    # If use chose to keep comments, remove comments from text
-                    # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
-                    # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
-                    code = self.html_content
-                    parser = html.HTMLParser(
-                        recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
-                        compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
-                    )
-                    fragment_root = html.fragment_fromstring(code, parser=parser)
-                    self.__text = TextHandler(fragment_root.text)
-                else:
-                    self.__text = TextHandler(self._root.text)
-            else:
-                # If user already chose to not keep comments then all is good
-                self.__text = TextHandler(self._root.text)
+            # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+            # before extracting text then keep `keep_comments` set to False while initializing the first class
+            self.__text = TextHandler(self._root.text)
         return self.__text
 
     def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
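
The parser change wires `is_jsonable` into `Adaptor.__init__`: when the fetched body is valid JSON, the full payload is cached on the text slot up front, so `.text` returns the raw JSON string instead of lxml's element text. A minimal sketch of the effect, assuming a JSON response body (URLs are placeholders):

```python
from scrapling import Adaptor

json_page = Adaptor(text='{"price": 42, "currency": "USD"}', url='https://example.com/api')
print(json_page.text)  # the whole JSON string, since the body was detected as JSON

html_page = Adaptor(text='<html><body><h1>Hi</h1></body></html>', url='https://example.com')
print(html_page.css_first('h1::text'))  # regular HTML parsing is unchanged
```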
scrapling/py.typed ADDED
@@ -0,0 +1 @@
+
scrapling-0.2.dist-info/METADATA → scrapling-0.2.1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2
+Version: 0.2.1
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
 Requires-Dist: playwright
 Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox >=0.3.7
+Requires-Dist: camoufox >=0.3.9
 Requires-Dist: browserforge
 
 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
@@ -54,15 +54,31 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 ```python
 >> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> fetcher = StealthyFetcher().fetch('https://example.com', headless=True, disable_resources=True)
->> print(fetcher.status)
+>> page = StealthyFetcher().fetch('https://example.com', headless=True, network_idle=True)
+>> print(page.status)
 200
->> page = fetcher.adaptor
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
 >> # Later, if the website structure changes, pass `auto_match=True`
 >> products = page.css('.product', auto_match=True)  # and Scrapling still finds them!
 ```
 
+# Sponsors
+
+[Evomi](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling) is your Swiss Quality Proxy Provider, starting at **$0.49/GB**
+
+- 👩‍💻 **$0.49 per GB Residential Proxies**: Our price is unbeatable
+- 👩‍💻 **24/7 Expert Support**: We will join your Slack Channel
+- 🌍 **Global Presence**: Available in 150+ Countries
+- ⚡ **Low Latency**
+- 🔒 **Swiss Quality and Privacy**
+- 🎁 **Free Trial**
+- 🛡️ **99.9% Uptime**
+- 🤝 **Special IP Pool selection**: Optimize for fast, quality or quantity of ips
+- 🔧 **Easy Integration**: Compatible with most software and programming languages
+
+[![Evomi Banner](https://my.evomi.com/images/brand/cta.png)](https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling)
+---
+
 ## Table of content
 * [Key Features](#key-features)
 * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
@@ -95,7 +111,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha
 * [Can Scrapling replace code built on top of BeautifulSoup4?](#can-scrapling-replace-code-built-on-top-of-beautifulsoup4)
 * [Can Scrapling replace code built on top of AutoScraper?](#can-scrapling-replace-code-built-on-top-of-autoscraper)
 * [Is Scrapling thread-safe?](#is-scrapling-thread-safe)
-* [Sponsors](#sponsors)
+* [More Sponsors!](#more-sponsors)
 * [Contributing](#contributing)
 * [Disclaimer for Scrapling Project](#disclaimer-for-scrapling-project)
 * [License](#license)
@@ -136,7 +152,7 @@ from scrapling import Fetcher
 fetcher = Fetcher(auto_match=False)
 
 # Fetch a web page and create an Adaptor instance
-page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True).adaptor
+page = fetcher.get('https://quotes.toscrape.com/', stealthy_headers=True)
 # Get all strings in the full page
 page.get_all_text(ignore_tags=('script', 'style'))
 
@@ -246,6 +262,8 @@ All fetcher-type classes are imported in the same way
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+
+Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
@@ -265,6 +283,8 @@ This class is built on top of [Camoufox](https://github.com/daijro/camoufox) whi
 >> page.status == 200
 True
 ```
+> Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+
 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
 
 | Argument | Description | Optional |
@@ -283,6 +303,8 @@ True
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
 | wait_selector | Wait for a specific css selector to be in a specific state. | ✔️ |
+| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
+| os_randomize | If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS. | ✔️ |
 | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
 
 </details>
@@ -293,9 +315,11 @@ This list isn't final so expect a lot more additions and flexibility to be added
 This class is built on top of [Playwright](https://playwright.dev/python/) which currently provides 4 main run options but they can be mixed as you want.
 ```python
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True)  # Vanilla Playwright option
->> page.adaptor.css_first("#search a::attr(href)")
+>> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
 ```
+> Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+
 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
 2) Stealthy Playwright with the stealth mode I wrote for it. It's still a WIP but it bypasses many online tests like [Sannysoft's](https://bot.sannysoft.com/).</br> Some of the things this fetcher's stealth mode does include:
@@ -323,6 +347,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | wait_selector_state | The state to wait for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
 | google_search | Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name. | ✔️ |
 | extra_headers | A dictionary of extra headers to add to the request. The referer set by the `google_search` argument takes priority over the referer set here if used together. | ✔️ |
+| proxy | The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
@@ -387,7 +412,7 @@ You can search for a specific ancestor of an element that satisfies a function,
 ### Content-based Selection & Finding Similar Elements
 You can select elements by their text content in multiple ways, here's a full example on another website:
 ```python
->>> page = Fetcher().get('https://books.toscrape.com/index.html').adaptor
+>>> page = Fetcher().get('https://books.toscrape.com/index.html')
 
 >>> page.find_by_text('Tipping the Velvet')  # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>
@@ -507,11 +532,11 @@ Now let's test the same selector in both versions
 >> old_url = "https://web.archive.org/web/20100102003420/http://stackoverflow.com/"
 >> new_url = "https://stackoverflow.com/"
 >>
->> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30).adaptor
+>> page = Fetcher(automatch_domain='stackoverflow.com').get(old_url, timeout=30)
 >> element1 = page.css_first(selector, auto_save=True)
 >>
 >> # Same selector but used in the updated website
->> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url).adaptor
+>> page = Fetcher(automatch_domain="stackoverflow.com").get(new_url)
 >> element2 = page.css_first(selector, auto_match=True)
 >>
 >> if element1.text == element2.text:
@@ -523,7 +548,7 @@ Note that I used a new argument called `automatch_domain`, this is because for S
 In a real-world scenario, the code will be the same except it will use the same URL for both requests so you won't need to use the `automatch_domain` argument. This is the closest example I can give to real-world cases so I hope it didn't confuse you :)
 
 **Notes:**
-1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you on the `.adaptor` property.
+1. For the two examples above I used one time the `Adaptor` class and the second time the `Fetcher` class just to show you that you can create the `Adaptor` object by yourself if you have the source or fetch the source using any `Fetcher` class then it will create the `Adaptor` object for you.
 2. Passing the `auto_save` argument with the `auto_match` argument set to `False` while initializing the Adaptor/Fetcher object will only result in ignoring the `auto_save` argument value and the following warning message
 ```text
 Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.
@@ -564,7 +589,7 @@ Examples to clear any confusion :)
 
 ```python
 >> from scrapling import Fetcher
->> page = Fetcher().get('https://quotes.toscrape.com/').adaptor
+>> page = Fetcher().get('https://quotes.toscrape.com/')
 # Find all elements with tag name `div`.
 >> page.find_all('div')
 [<data='<div class="container"> <div class="row...' parent='<body> <div class="container"> <div clas...'>,
@@ -727,7 +752,10 @@ There are a lot of deep details skipped here to make this as short as possible s
 
 Note that implementing your storage system can be complex as there are some strict rules such as inheriting from the same abstract class, following the singleton design pattern used in other classes, and more. So make sure to read the docs first.
 
-To give detailed documentation of the library, it will need a website. I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. But you can help by using the [sponsor button](https://github.com/sponsors/D4Vinci) above :)
+> [!IMPORTANT]
+> A website is needed to provide detailed library documentation.<br/>
+> I'm trying to rush creating the website, researching new ideas, and adding more features/tests/benchmarks but time is tight with too many spinning plates between work, personal life, and working on Scrapling. I have been working on Scrapling for months for free after all.<br/><br/>
+> If you like `Scrapling` and want it to keep improving then this is a friendly reminder that you can help by supporting me through the [sponsor button](https://github.com/sponsors/D4Vinci).
 
 ## ⚡ Enlightening Questions and FAQs
 This section addresses common questions about Scrapling, please read this section before opening an issue.
@@ -741,8 +769,8 @@ This section addresses common questions about Scrapling, please read this sectio
 
 Together both are used to retrieve the element's unique properties from the database later.
 4. Now later when you enable the `auto_match` parameter for both the Adaptor instance and the method call. The element properties are retrieved and Scrapling loops over all elements in the page and compares each one's unique properties to the unique properties we already have for this element and a score is calculated for each one.
-5. The comparison between elements is not exact but more about finding how similar these values are, so everything is taken into consideration even the values' order like the order in which the element class names were written before and the order in which the same element class names are written now.
-6. The score for each element is stored in the table, and in the end, the element(s) with the highest combined similarity scores are returned.
+5. Comparing elements is not exact but more about finding how similar these values are, so everything is taken into consideration, even the values' order, like the order in which the element class names were written before and the order in which the same element class names are written now.
+6. The score for each element is stored in the table, and the element(s) with the highest combined similarity scores are returned.
 
 ### How does the auto-matching work if I didn't pass a URL while initializing the Adaptor object?
 Not a big problem as it depends on your usage. The word `default` will be used in place of the URL field while saving the element's unique properties. So this will only be an issue if you used the same identifier later for a different website that you didn't pass the URL parameter while initializing it as well. The save process will overwrite the previous data and auto-matching uses the latest saved properties only.
@@ -773,7 +801,7 @@ Of course, you can find elements by text/regex, find similar elements in a more
 ### Is Scrapling thread-safe?
 Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state.
 
-## Sponsors
+## More Sponsors!
 [![Capsolver Banner](https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/CapSolver.png)](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
 
 ## Contributing
scrapling-0.2.dist-info/RECORD → scrapling-0.2.1.dist-info/RECORD CHANGED
@@ -1,22 +1,23 @@
-scrapling/__init__.py,sha256=cSitNNcOc3Ud0zZvaLy5NDfZ4c8_UCLWe7FfTBazKnY,433
-scrapling/fetchers.py,sha256=KD2moKWPYEcu7Lq4zIeBXcusmhFlPPueYSjyl8fMpLQ,15365
-scrapling/parser.py,sha256=oC1I9_jDP4zemU6V9e6wDyP-CQk2aMhJzSF2BGSBGp0,54253
+scrapling/__init__.py,sha256=x8S2Da-4KgUBzNYdM9ahYw3hDw5875KnpDliQWxQiGo,435
+scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
+scrapling/parser.py,sha256=VGbrARu2hxXyKLbUgtdtht_tljDYPT1jaWZWgoncv5U,53551
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
 scrapling/core/custom_types.py,sha256=-gMNOiByewoAUqFVrDp822V51rcWNlWVUOB6yGUL648,8403
 scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
 scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
 scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
-scrapling/core/utils.py,sha256=o35SxakRw5Bq_hpOiHu1KaSWrOBxeQpEMuOzG88NCqE,3530
+scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=Cq8960Uz-y-__4OJviHXPPhjbbVz1ILt9koaPic2x8w,6954
+scrapling/engines/camo.py,sha256=P8kPxP0awgV-AGMibMNDJUaxZC9oYDP64Ei_dk9D3jA,7549
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=cx1B0mfatEoGYpFkDt5zPg_cb0lKU0mu4MjuuU-COes,11805
-scrapling/engines/static.py,sha256=K-tT8mEfJY0Ix_gZceazeFIYmZ_ko4nyqZptj6POYmM,7159
-scrapling/engines/toolbelt/__init__.py,sha256=3zWs5aiV8QP5ua-cvIBkCRaDhmjWEEx_xycVpdp3ur4,341
-scrapling/engines/toolbelt/custom.py,sha256=cqXQ2UdzoH0IXBAa0ySg_90kPhlP-f2fLAauJUAMFOs,8167
+scrapling/engines/pw.py,sha256=JKPdJkfz--8YyngLxFNwEyWF0O3_o5xR7zQCxF1D8Cs,12121
+scrapling/engines/static.py,sha256=dY1iLBe7YhzRJYd9MM8P7hbqF44cpwOgTJ6CkIVfaRA,7120
+scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
+scrapling/engines/toolbelt/custom.py,sha256=XB_oINjmVnigODxfP9hl-teRy0BkJqfrEprWDAqO-Jo,7473
 scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
-scrapling/engines/toolbelt/navigation.py,sha256=04Y1zjkVAgmvbgM3tHn6NsAruh5x6ESH1w0EW8CdVxo,2452
+scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
 tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
 tests/fetchers/test_camoufox.py,sha256=XPTCDZ9sj_GpCzXyvzKF_uZWhEYX6J_jh_BLeMEl8yY,2874
@@ -25,8 +26,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
-scrapling-0.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
-scrapling-0.2.dist-info/METADATA,sha256=yieOuAeWNwx5UMtQN-E1bsNnKEum4xGgPUynOgbG7m0,61418
-scrapling-0.2.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-scrapling-0.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
-scrapling-0.2.dist-info/RECORD,,
+scrapling-0.2.1.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.1.dist-info/METADATA,sha256=aeExP8jl7VQxIUnfvvo4QxIeasqfziscacOrOoHOuXk,64155
+scrapling-0.2.1.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+scrapling-0.2.1.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.1.dist-info/RECORD,,
scrapling-0.2.dist-info/WHEEL → scrapling-0.2.1.dist-info/WHEEL CHANGED
@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (75.3.0)
+Generator: setuptools (75.5.0)
 Root-Is-Purelib: true
 Tag: py3-none-any
 