scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. scrapling/__init__.py +18 -31
  2. scrapling/cli.py +818 -20
  3. scrapling/core/_html_utils.py +348 -0
  4. scrapling/core/_types.py +34 -17
  5. scrapling/core/ai.py +611 -0
  6. scrapling/core/custom_types.py +183 -100
  7. scrapling/core/mixins.py +27 -19
  8. scrapling/core/shell.py +647 -0
  9. scrapling/core/{storage_adaptors.py → storage.py} +41 -33
  10. scrapling/core/translator.py +20 -26
  11. scrapling/core/utils.py +49 -54
  12. scrapling/engines/__init__.py +15 -6
  13. scrapling/engines/_browsers/__init__.py +2 -0
  14. scrapling/engines/_browsers/_camoufox.py +759 -0
  15. scrapling/engines/_browsers/_config_tools.py +130 -0
  16. scrapling/engines/_browsers/_controllers.py +644 -0
  17. scrapling/engines/_browsers/_page.py +93 -0
  18. scrapling/engines/_browsers/_validators.py +170 -0
  19. scrapling/engines/constants.py +101 -88
  20. scrapling/engines/static.py +667 -110
  21. scrapling/engines/toolbelt/__init__.py +20 -6
  22. scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
  23. scrapling/engines/toolbelt/convertor.py +254 -0
  24. scrapling/engines/toolbelt/custom.py +158 -175
  25. scrapling/engines/toolbelt/fingerprints.py +32 -46
  26. scrapling/engines/toolbelt/navigation.py +68 -39
  27. scrapling/fetchers.py +239 -333
  28. scrapling/parser.py +781 -449
  29. scrapling-0.3.1.dist-info/METADATA +411 -0
  30. scrapling-0.3.1.dist-info/RECORD +41 -0
  31. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
  32. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
  33. scrapling/defaults.py +0 -25
  34. scrapling/engines/camo.py +0 -339
  35. scrapling/engines/pw.py +0 -465
  36. scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
  37. scrapling-0.2.99.dist-info/METADATA +0 -290
  38. scrapling-0.2.99.dist-info/RECORD +0 -49
  39. tests/__init__.py +0 -1
  40. tests/fetchers/__init__.py +0 -1
  41. tests/fetchers/async/__init__.py +0 -0
  42. tests/fetchers/async/test_camoufox.py +0 -97
  43. tests/fetchers/async/test_httpx.py +0 -85
  44. tests/fetchers/async/test_playwright.py +0 -101
  45. tests/fetchers/sync/__init__.py +0 -0
  46. tests/fetchers/sync/test_camoufox.py +0 -70
  47. tests/fetchers/sync/test_httpx.py +0 -84
  48. tests/fetchers/sync/test_playwright.py +0 -89
  49. tests/fetchers/test_utils.py +0 -97
  50. tests/parser/__init__.py +0 -0
  51. tests/parser/test_automatch.py +0 -111
  52. tests/parser/test_general.py +0 -330
  53. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
  54. {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
scrapling/engines/toolbelt/navigation.py
@@ -1,74 +1,97 @@
 """
 Functions related to files and URLs
 """
-import os
+
+from pathlib import Path
+from functools import lru_cache
 from urllib.parse import urlencode, urlparse
 
 from playwright.async_api import Route as async_Route
+from msgspec import Struct, structs, convert, ValidationError
 from playwright.sync_api import Route
 
-from scrapling.core._types import Dict, Optional, Union
-from scrapling.core.utils import log, lru_cache
+from scrapling.core.utils import log
+from scrapling.core._types import Dict, Optional, Tuple
 from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
 
+__BYPASSES_DIR__ = Path(__file__).parent / "bypasses"
+
+
+class ProxyDict(Struct):
+    server: str
+    username: str = ""
+    password: str = ""
+
 
 def intercept_route(route: Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         route.abort()
     else:
         route.continue_()
 
 
 async def async_intercept_route(route: async_Route):
-    """This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
+    """This is just a route handler, but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
 
     :param route: PlayWright `Route` object of the current page
     :return: PlayWright `Route` object
     """
     if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
-        log.debug(f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"')
+        log.debug(
+            f'Blocking background resource "{route.request.url}" of type "{route.request.resource_type}"'
+        )
         await route.abort()
     else:
         await route.continue_()
 
 
-def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict, None]:
+def construct_proxy_dict(
+    proxy_string: str | Dict[str, str], as_tuple=False
+) -> Optional[Dict | Tuple]:
     """Validate a proxy and return it in the acceptable format for Playwright
     Reference: https://playwright.dev/python/docs/network#http-proxy
 
     :param proxy_string: A string or a dictionary representation of the proxy.
+    :param as_tuple: Return the proxy dictionary as a tuple to be cachable
     :return:
     """
-    if proxy_string:
-        if isinstance(proxy_string, str):
-            proxy = urlparse(proxy_string)
-            try:
-                return {
-                    'server': f'{proxy.scheme}://{proxy.hostname}:{proxy.port}',
-                    'username': proxy.username or '',
-                    'password': proxy.password or '',
-                }
-            except ValueError:
-                # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
-                raise TypeError('The proxy argument\'s string is in invalid format!')
-
-        elif isinstance(proxy_string, dict):
-            valid_keys = ('server', 'username', 'password', )
-            if all(key in valid_keys for key in proxy_string.keys()) and not any(key not in valid_keys for key in proxy_string.keys()):
-                return proxy_string
-            else:
-                raise TypeError(f'A proxy dictionary must have only these keys: {valid_keys}')
-
-        else:
-            raise TypeError(f'Invalid type of proxy ({type(proxy_string)}), the proxy argument must be a string or a dictionary!')
-
-    # The default value for proxy in Playwright's source is `None`
+    if isinstance(proxy_string, str):
+        proxy = urlparse(proxy_string)
+        if (
+            proxy.scheme not in ("http", "https", "socks4", "socks5")
+            or not proxy.hostname
+        ):
+            raise ValueError("Invalid proxy string!")
+
+        try:
+            result = {
+                "server": f"{proxy.scheme}://{proxy.hostname}",
+                "username": proxy.username or "",
+                "password": proxy.password or "",
+            }
+            if proxy.port:
+                result["server"] += f":{proxy.port}"
+            return tuple(result.items()) if as_tuple else result
+        except ValueError:
+            # Urllib will say that one of the parameters above can't be casted to the correct type like `int` for port etc...
+            raise ValueError("The proxy argument's string is in invalid format!")
+
+    elif isinstance(proxy_string, dict):
+        try:
+            validated = convert(proxy_string, ProxyDict)
+            result_dict = structs.asdict(validated)
+            return tuple(result_dict.items()) if as_tuple else result_dict
+        except ValidationError as e:
+            raise TypeError(f"Invalid proxy dictionary: {e}")
+
     return None
 
 
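The rewritten construct_proxy_dict above validates the proxy scheme, keeps the port optional, routes dictionary input through the new msgspec ProxyDict struct, and can return a hashable tuple via as_tuple=True. A minimal usage sketch, assuming the wheel's own import path and Playwright's documented sync API; the browser launch and routing shown here are illustrative, not how scrapling itself wires these helpers up:

    # Hypothetical illustration (not taken from the scrapling source).
    from playwright.sync_api import sync_playwright

    from scrapling.engines.toolbelt.navigation import (
        construct_proxy_dict,
        intercept_route,
    )

    # String form: scheme, hostname, optional port and credentials are validated.
    proxy = construct_proxy_dict("http://user:pass@127.0.0.1:8080")
    # -> {'server': 'http://127.0.0.1:8080', 'username': 'user', 'password': 'pass'}

    # Dict form is validated through the msgspec `ProxyDict` struct;
    # missing username/password default to empty strings.
    same_proxy = construct_proxy_dict({"server": "http://127.0.0.1:8080"})

    # `as_tuple=True` returns a tuple of items, which is hashable and so can be
    # passed through `lru_cache`-decorated call paths.
    cache_key = construct_proxy_dict("http://127.0.0.1:8080", as_tuple=True)

    with sync_playwright() as p:
        browser = p.chromium.launch(proxy=proxy)
        page = browser.new_page()
        # Drop the resource types listed in DEFAULT_DISABLED_RESOURCES.
        page.route("**/*", intercept_route)
        page.goto("https://example.com")
        browser.close()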
@@ -84,17 +107,24 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
     parsed = urlparse(cdp_url)
 
     # Check scheme
-    if parsed.scheme not in ('ws', 'wss'):
+    if parsed.scheme not in ("ws", "wss"):
         raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
 
     # Validate hostname and port
     if not parsed.netloc:
         raise ValueError("Invalid hostname for the CDP URL")
 
-    # Ensure path starts with /
+    try:
+        # Checking if the port is valid (if available)
+        _ = parsed.port
+    except ValueError:
+        # urlparse will raise `ValueError` if the port can't be casted to integer
+        raise ValueError("Invalid port for the CDP URL")
+
+    # Ensure the path starts with /
     path = parsed.path
-    if not path.startswith('/'):
-        path = '/' + path
+    if not path.startswith("/"):
+        path = "/" + path
 
     # Reconstruct the base URL with validated parts
     validated_base = f"{parsed.scheme}://{parsed.netloc}{path}"
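The hunk above adds an early port check to construct_cdp_url before the path is normalized. A short sketch of the validation behaviour, using hypothetical endpoints and query parameters; the query-string handling itself lives in the unchanged tail of the function:

    # Hypothetical illustration (URLs and the "token" parameter are made up).
    from scrapling.engines.toolbelt.navigation import construct_cdp_url

    construct_cdp_url("ws://127.0.0.1:9222/devtools/browser")       # passes validation
    construct_cdp_url("wss://browser.example.com", {"token": "x"})  # query params appended by the function

    try:
        construct_cdp_url("http://127.0.0.1:9222")  # wrong scheme
    except ValueError as e:
        print(e)  # CDP URL must use 'ws://' or 'wss://' scheme

    try:
        construct_cdp_url("ws://127.0.0.1:notaport")  # new in 0.3.1: a bad port is rejected early
    except ValueError as e:
        print(e)  # Invalid port for the CDP URL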
@@ -112,10 +142,9 @@ def construct_cdp_url(cdp_url: str, query_params: Optional[Dict] = None) -> str:
 
 @lru_cache(10, typed=True)
 def js_bypass_path(filename: str) -> str:
-    """Takes the base filename of JS file inside the `bypasses` folder then return the full path of it
+    """Takes the base filename of a JS file inside the `bypasses` folder, then return the full path of it
 
     :param filename: The base filename of the JS file.
     :return: The full path of the JS file.
     """
-    current_directory = os.path.dirname(__file__)
-    return os.path.join(current_directory, 'bypasses', filename)
+    return str(__BYPASSES_DIR__ / filename)
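With js_bypass_path now built on the module-level __BYPASSES_DIR__ constant and wrapped in lru_cache(10, typed=True), resolving a bundled script is a cached path lookup. A small sketch, using a script name that ships in this wheel's bypasses folder:

    # Hypothetical sketch (not part of the package): resolving a bundled bypass script.
    from scrapling.engines.toolbelt.navigation import js_bypass_path

    script_path = js_bypass_path("playwright_fingerprint.js")  # file shipped in bypasses/
    with open(script_path) as f:
        js_source = f.read()

    # Later calls with the same filename are served from the lru_cache wrapper,
    # so the path string is only built once per distinct filename.
    assert js_bypass_path("playwright_fingerprint.js") == script_path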