scrapling 0.3.6__tar.gz → 0.3.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. {scrapling-0.3.6/scrapling.egg-info → scrapling-0.3.7}/PKG-INFO +3 -4
  2. {scrapling-0.3.6 → scrapling-0.3.7}/README.md +1 -2
  3. {scrapling-0.3.6 → scrapling-0.3.7}/pyproject.toml +2 -2
  4. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/__init__.py +1 -1
  5. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/_types.py +3 -0
  6. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/ai.py +2 -1
  7. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/custom_types.py +20 -27
  8. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/mixins.py +15 -9
  9. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/shell.py +4 -3
  10. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/storage.py +5 -5
  11. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/translator.py +13 -8
  12. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_base.py +37 -14
  13. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_camoufox.py +76 -35
  14. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_config_tools.py +1 -1
  15. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_controllers.py +32 -11
  16. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_validators.py +31 -10
  17. scrapling-0.3.7/scrapling/engines/static.py +1074 -0
  18. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/convertor.py +13 -15
  19. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/custom.py +6 -9
  20. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/fingerprints.py +17 -10
  21. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/navigation.py +11 -3
  22. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/fetchers/__init__.py +11 -1
  23. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/fetchers/chrome.py +9 -4
  24. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/fetchers/firefox.py +0 -4
  25. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/parser.py +105 -80
  26. {scrapling-0.3.6 → scrapling-0.3.7/scrapling.egg-info}/PKG-INFO +3 -4
  27. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/requires.txt +1 -1
  28. {scrapling-0.3.6 → scrapling-0.3.7}/setup.cfg +1 -1
  29. scrapling-0.3.6/scrapling/engines/static.py +0 -1064
  30. {scrapling-0.3.6 → scrapling-0.3.7}/LICENSE +0 -0
  31. {scrapling-0.3.6 → scrapling-0.3.7}/MANIFEST.in +0 -0
  32. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/cli.py +0 -0
  33. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/__init__.py +0 -0
  34. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/_html_utils.py +0 -0
  35. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/utils/__init__.py +0 -0
  36. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/utils/_shell.py +0 -0
  37. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/core/utils/_utils.py +0 -0
  38. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/__init__.py +0 -0
  39. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/__init__.py +0 -0
  40. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/_browsers/_page.py +0 -0
  41. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/constants.py +0 -0
  42. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/__init__.py +0 -0
  43. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  44. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  45. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  46. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  47. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  48. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  49. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/fetchers/requests.py +0 -0
  50. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling/py.typed +0 -0
  51. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/SOURCES.txt +0 -0
  52. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/dependency_links.txt +0 -0
  53. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/entry_points.txt +0 -0
  54. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/not-zip-safe +0 -0
  55. {scrapling-0.3.6 → scrapling-0.3.7}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: scrapling
- Version: 0.3.6
+ Version: 0.3.7
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -77,7 +77,7 @@ Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
  Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
  Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
  Provides-Extra: ai
- Requires-Dist: mcp>=1.15.0; extra == "ai"
+ Requires-Dist: mcp>=1.16.0; extra == "ai"
  Requires-Dist: markdownify>=1.2.0; extra == "ai"
  Requires-Dist: scrapling[fetchers]; extra == "ai"
  Provides-Extra: shell
@@ -162,7 +162,6 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
  <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
- <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
  <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

@@ -389,7 +388,7 @@ Starting with v0.3.2, this installation only includes the parser engine and its
  ### Docker
  You can also install a Docker image with all extras and browsers with the following command:
  ```bash
- docker pull scrapling
+ docker pull pyd4vinci/scrapling
  ```
  This image is automatically built and pushed to Docker Hub through GitHub actions right here.

@@ -72,7 +72,6 @@ Built for the modern Web, Scrapling features its own rapid parsing engine and fe
  <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
  <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
- <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
  <a href="https://www.rapidproxy.io/?ref=d4v" target="_blank" title="Affordable Access to the Proxy World – bypass CAPTCHAs blocks, and avoid additional costs."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/rapidproxy.jpg"></a>
  <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>

@@ -299,7 +298,7 @@ Starting with v0.3.2, this installation only includes the parser engine and its
  ### Docker
  You can also install a Docker image with all extras and browsers with the following command:
  ```bash
- docker pull scrapling
+ docker pull pyd4vinci/scrapling
  ```
  This image is automatically built and pushed to Docker Hub through GitHub actions right here.

@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
  [project]
  name = "scrapling"
  # Static version instead of dynamic version so we can get better layer caching while building docker, check the docker file to understand
- version = "0.3.6"
+ version = "0.3.7"
  description = "Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!"
  readme = {file = "README.md", content-type = "text/markdown"}
  license = {file = "LICENSE"}
@@ -74,7 +74,7 @@ fetchers = [
  "msgspec>=0.19.0",
  ]
  ai = [
- "mcp>=1.15.0",
+ "mcp>=1.16.0",
  "markdownify>=1.2.0",
  "scrapling[fetchers]",
  ]
@@ -1,5 +1,5 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3.6"
+ __version__ = "0.3.7"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

  from typing import Any, TYPE_CHECKING
@@ -12,9 +12,11 @@ from typing import (
  Generator,
  Iterable,
  List,
+ Set,
  Literal,
  Optional,
  Pattern,
+ Sequence,
  Tuple,
  TypeVar,
  Union,
@@ -22,6 +24,7 @@ from typing import (
  Mapping,
  Awaitable,
  Protocol,
+ Coroutine,
  SupportsIndex,
  )

@@ -20,6 +20,7 @@ from scrapling.core._types import (
  Mapping,
  Dict,
  List,
+ Any,
  SelectorWaitStates,
  Generator,
  )
@@ -171,7 +172,7 @@ class ScraplingMCPServer:
  :param stealthy_headers: If enabled (default), it creates and adds real browser headers. It also sets the referer header as if this request came from a Google search of URL's domain.
  """
  async with FetcherSession() as session:
- tasks = [
+ tasks: List[Any] = [
  session.get(
  url,
  auth=auth,
@@ -5,6 +5,7 @@ from re import compile as re_compile, UNICODE, IGNORECASE
  from orjson import dumps, loads

  from scrapling.core._types import (
+ Any,
  cast,
  Dict,
  List,
@@ -14,7 +15,6 @@ from scrapling.core._types import (
  Literal,
  Pattern,
  Iterable,
- Optional,
  Generator,
  SupportsIndex,
  )
@@ -33,23 +33,20 @@ class TextHandler(str):

  def __getitem__(self, key: SupportsIndex | slice) -> "TextHandler": # pragma: no cover
  lst = super().__getitem__(key)
- return cast(_TextHandlerType, TextHandler(lst))
+ return TextHandler(lst)

- def split(self, sep: str = None, maxsplit: SupportsIndex = -1) -> "TextHandlers": # pragma: no cover
- return TextHandlers(
- cast(
- List[_TextHandlerType],
- [TextHandler(s) for s in super().split(sep, maxsplit)],
- )
- )
+ def split(
+ self, sep: str | None = None, maxsplit: SupportsIndex = -1
+ ) -> Union[List, "TextHandlers"]: # pragma: no cover
+ return TextHandlers([TextHandler(s) for s in super().split(sep, maxsplit)])

- def strip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+ def strip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
  return TextHandler(super().strip(chars))

- def lstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+ def lstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
  return TextHandler(super().lstrip(chars))

- def rstrip(self, chars: str = None) -> Union[str, "TextHandler"]: # pragma: no cover
+ def rstrip(self, chars: str | None = None) -> Union[str, "TextHandler"]: # pragma: no cover
  return TextHandler(super().rstrip(chars))

  def capitalize(self) -> Union[str, "TextHandler"]: # pragma: no cover
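The point of the signature changes above is that every overridden `str` method keeps returning the `TextHandler` subclass, so chained calls never degrade to plain `str`. A minimal standalone sketch of that pattern (illustrative only, not the package's actual class, which overrides many more methods):

```python
# Minimal sketch of the "str subclass that keeps returning itself" pattern
# used by TextHandler above; the real class overrides many more methods.
class Handler(str):
    def strip(self, chars: str | None = None) -> "Handler":
        return Handler(super().strip(chars))

    def split(self, sep: str | None = None, maxsplit: int = -1) -> list["Handler"]:
        return [Handler(s) for s in super().split(sep, maxsplit)]


text = Handler("  hello world  ")
assert type(text.strip()) is Handler
assert all(type(part) is Handler for part in text.strip().split())
```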
@@ -64,7 +61,7 @@ class TextHandler(str):
  def expandtabs(self, tabsize: SupportsIndex = 8) -> Union[str, "TextHandler"]: # pragma: no cover
  return TextHandler(super().expandtabs(tabsize))

- def format(self, *args: str, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
+ def format(self, *args: object, **kwargs: str) -> Union[str, "TextHandler"]: # pragma: no cover
  return TextHandler(super().format(*args, **kwargs))

  def format_map(self, mapping) -> Union[str, "TextHandler"]: # pragma: no cover
@@ -131,10 +128,11 @@ class TextHandler(str):
  def re(
  self,
  regex: str | Pattern,
- check_match: Literal[True],
  replace_entities: bool = True,
  clean_match: bool = False,
  case_sensitive: bool = True,
+ *,
+ check_match: Literal[True],
  ) -> bool: ...

  @overload
@@ -179,19 +177,14 @@ class TextHandler(str):
  results = flatten(results)

  if not replace_entities:
- return TextHandlers(cast(List[_TextHandlerType], [TextHandler(string) for string in results]))
+ return TextHandlers([TextHandler(string) for string in results])

- return TextHandlers(
- cast(
- List[_TextHandlerType],
- [TextHandler(_replace_entities(s)) for s in results],
- )
- )
+ return TextHandlers([TextHandler(_replace_entities(s)) for s in results])

  def re_first(
  self,
  regex: str | Pattern,
- default=None,
+ default: Any = None,
  replace_entities: bool = True,
  clean_match: bool = False,
  case_sensitive: bool = True,
@@ -232,8 +225,8 @@ class TextHandlers(List[TextHandler]):
  def __getitem__(self, pos: SupportsIndex | slice) -> Union[TextHandler, "TextHandlers"]:
  lst = super().__getitem__(pos)
  if isinstance(pos, slice):
- return TextHandlers(cast(List[_TextHandlerType], lst))
- return cast(_TextHandlerType, TextHandler(lst))
+ return TextHandlers(cast(List[TextHandler], lst))
+ return TextHandler(cast(TextHandler, lst))

  def re(
  self,
@@ -256,7 +249,7 @@ class TextHandlers(List[TextHandler]):
  def re_first(
  self,
  regex: str | Pattern,
- default=None,
+ default: Any = None,
  replace_entities: bool = True,
  clean_match: bool = False,
  case_sensitive: bool = True,
@@ -309,9 +302,9 @@ class AttributesHandler(Mapping[str, _TextHandlerType]):
  )

  # Fastest read-only mapping type
- self._data = MappingProxyType(mapping)
+ self._data: Mapping[str, Any] = MappingProxyType(mapping)

- def get(self, key: str, default: Optional[str] = None) -> Optional[_TextHandlerType]:
+ def get(self, key: str, default: Any = None) -> _TextHandlerType:
  """Acts like the standard dictionary `.get()` method"""
  return self._data.get(key, default)

@@ -1,3 +1,9 @@
+ from scrapling.core._types import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+ from scrapling.parser import Selector
+
+
  class SelectorsGeneration:
  """
  Functions for generating selectors
@@ -5,7 +11,7 @@ class SelectorsGeneration:
  Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591
  """

- def __general_selection(self, selection: str = "css", full_path: bool = False) -> str:
+ def _general_selection(self: "Selector", selection: str = "css", full_path: bool = False) -> str: # type: ignore[name-defined]
  """Generate a selector for the current element.
  :return: A string of the generated selector.
  """
@@ -47,29 +53,29 @@ class SelectorsGeneration:
  return " > ".join(reversed(selectorPath)) if css else "//" + "/".join(reversed(selectorPath))

  @property
- def generate_css_selector(self) -> str:
+ def generate_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
  """Generate a CSS selector for the current element
  :return: A string of the generated selector.
  """
- return self.__general_selection()
+ return self._general_selection()

  @property
- def generate_full_css_selector(self) -> str:
+ def generate_full_css_selector(self: "Selector") -> str: # type: ignore[name-defined]
  """Generate a complete CSS selector for the current element
  :return: A string of the generated selector.
  """
- return self.__general_selection(full_path=True)
+ return self._general_selection(full_path=True)

  @property
- def generate_xpath_selector(self) -> str:
+ def generate_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
  """Generate an XPath selector for the current element
  :return: A string of the generated selector.
  """
- return self.__general_selection("xpath")
+ return self._general_selection("xpath")

  @property
- def generate_full_xpath_selector(self) -> str:
+ def generate_full_xpath_selector(self: "Selector") -> str: # type: ignore[name-defined]
  """Generate a complete XPath selector for the current element
  :return: A string of the generated selector.
  """
- return self.__general_selection("xpath", full_path=True)
+ return self._general_selection("xpath", full_path=True)
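For orientation, these properties are reached from parsed elements at runtime. A hedged usage sketch (assumes scrapling is installed and that `Selector` accepts a raw HTML string as its first argument):

```python
# Hedged usage sketch of the selector-generation properties shown above.
# Assumption: Selector can be constructed directly from an HTML string.
from scrapling.parser import Selector

page = Selector("<html><body><div id='app'><a class='link'>hi</a></div></body></html>")
link = page.css_first("a.link")
print(link.generate_css_selector)         # shortest CSS path to the element
print(link.generate_full_xpath_selector)  # full XPath from the document root
```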
@@ -31,6 +31,7 @@ from scrapling.core._types import (
  Optional,
  Dict,
  Any,
+ cast,
  extraction_types,
  Generator,
  )
@@ -540,15 +541,15 @@ class Convertor:
  raise ValueError(f"Unknown extraction type: {extraction_type}")
  else:
  if main_content_only:
- page = page.css_first("body") or page
+ page = cast(Selector, page.css_first("body")) or page

- pages = [page] if not css_selector else page.css(css_selector)
+ pages = [page] if not css_selector else cast(Selectors, page.css(css_selector))
  for page in pages:
  match extraction_type:
  case "markdown":
  yield cls._convert_to_markdown(page.html_content)
  case "html":
- yield page.body
+ yield page.html_content
  case "text":
  txt_content = page.get_all_text(strip=True)
  for s in (
@@ -56,13 +56,13 @@ class StorageSystemMixin(ABC): # pragma: no cover
  @lru_cache(128, typed=True)
  def _get_hash(identifier: str) -> str:
  """If you want to hash identifier in your storage system, use this safer"""
- identifier = identifier.lower().strip()
- if isinstance(identifier, str):
+ _identifier = identifier.lower().strip()
+ if isinstance(_identifier, str):
  # Hash functions have to take bytes
- identifier = identifier.encode("utf-8")
+ _identifier = _identifier.encode("utf-8")

- hash_value = sha256(identifier).hexdigest()
- return f"{hash_value}_{len(identifier)}" # Length to reduce collision chance
+ hash_value = sha256(_identifier).hexdigest()
+ return f"{hash_value}_{len(_identifier)}" # Length to reduce collision chance


  @lru_cache(1, typed=True)
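A condensed, standalone sketch of what the fixed helper computes, assuming the same sha256-plus-length scheme shown above (the `isinstance` branch is collapsed because the input is always a `str` here):

```python
# Standalone sketch of the _get_hash logic after the fix: normalize, encode,
# hash, then append the byte length to reduce collision chance.
from functools import lru_cache
from hashlib import sha256


@lru_cache(128, typed=True)
def get_hash(identifier: str) -> str:
    data = identifier.lower().strip().encode("utf-8")
    return f"{sha256(data).hexdigest()}_{len(data)}"


print(get_hash("Example.com "))  # same result as get_hash("example.com")
```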
@@ -10,24 +10,23 @@ So you don't have to learn a new selectors/api method like what bs4 done with so

  from functools import lru_cache

- from cssselect.xpath import ExpressionError
- from cssselect.xpath import XPathExpr as OriginalXPathExpr
  from cssselect import HTMLTranslator as OriginalHTMLTranslator
+ from cssselect.xpath import ExpressionError, XPathExpr as OriginalXPathExpr
  from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

- from scrapling.core._types import Any, Optional, Protocol, Self
+ from scrapling.core._types import Any, Protocol, Self


  class XPathExpr(OriginalXPathExpr):
  textnode: bool = False
- attribute: Optional[str] = None
+ attribute: str | None = None

  @classmethod
  def from_xpath(
  cls,
  xpath: OriginalXPathExpr,
  textnode: bool = False,
- attribute: Optional[str] = None,
+ attribute: str | None = None,
  ) -> Self:
  x = cls(path=xpath.path, element=xpath.element, condition=xpath.condition)
  x.textnode = textnode
@@ -71,10 +70,10 @@ class XPathExpr(OriginalXPathExpr):

  # e.g. cssselect.GenericTranslator, cssselect.HTMLTranslator
  class TranslatorProtocol(Protocol):
- def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pragma: no cover
+ def xpath_element(self, selector: Element) -> OriginalXPathExpr: # pyright: ignore # pragma: no cover
  pass

- def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pragma: no cover
+ def css_to_xpath(self, css: str, prefix: str = ...) -> str: # pyright: ignore # pragma: no cover
  pass


@@ -121,9 +120,15 @@ class TranslatorMixin:


  class HTMLTranslator(TranslatorMixin, OriginalHTMLTranslator):
- @lru_cache(maxsize=256)
  def css_to_xpath(self, css: str, prefix: str = "descendant-or-self::") -> str:
  return super().css_to_xpath(css, prefix)


  translator = HTMLTranslator()
+ # Using a function instead of the translator directly to avoid Pyright override error
+
+
+ @lru_cache(maxsize=256)
+ def css_to_xpath(query: str) -> str:
+ """Return translated XPath version of a given CSS query"""
+ return translator.css_to_xpath(query)
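The hunk above moves caching from the overridden method onto a module-level helper, so the override signature stays clean for Pyright. A minimal, self-contained sketch of that pattern using plain cssselect (scrapling's translator layers its own pseudo-element handling on top of this):

```python
# Minimal sketch: cache CSS -> XPath translation in a module-level function
# instead of decorating the overridden method, so type checkers see a clean
# override. Uses plain cssselect rather than scrapling's subclass.
from functools import lru_cache
from cssselect import HTMLTranslator

_translator = HTMLTranslator()


@lru_cache(maxsize=256)
def css_to_xpath(query: str) -> str:
    """Return the XPath equivalent of a CSS query."""
    return _translator.css_to_xpath(query, prefix="descendant-or-self::")


print(css_to_xpath("div.product > a"))
```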
@@ -7,14 +7,12 @@ from playwright.async_api import (
  BrowserContext as AsyncBrowserContext,
  Playwright as AsyncPlaywright,
  )
- from camoufox.utils import (
- launch_options as generate_launch_options,
- installed_verstr as camoufox_version,
- )
+ from camoufox.pkgman import installed_verstr as camoufox_version
+ from camoufox.utils import launch_options as generate_launch_options

  from ._page import PageInfo, PagePool
  from scrapling.parser import Selector
- from scrapling.core._types import Dict, Optional
+ from scrapling.core._types import Any, cast, Dict, Optional, TYPE_CHECKING
  from scrapling.engines.toolbelt.fingerprints import get_os_name
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
  from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
@@ -41,6 +39,7 @@ class SyncSession:
  """Get a new page to use"""

  # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
+ assert self.context is not None, "Browser context not initialized"
  page = self.context.new_page()
  page.set_default_navigation_timeout(timeout)
  page.set_default_timeout(timeout)
@@ -65,11 +64,14 @@ class SyncSession:
  }


- class AsyncSession(SyncSession):
+ class AsyncSession:
  def __init__(self, max_pages: int = 1):
- super().__init__(max_pages)
+ self.max_pages = max_pages
+ self.page_pool = PagePool(max_pages)
+ self._max_wait_for_page = 60
  self.playwright: Optional[AsyncPlaywright] = None
  self.context: Optional[AsyncBrowserContext] = None
+ self._closed = False
  self._lock = Lock()

  async def _get_page(
@@ -79,6 +81,9 @@ class AsyncSession(SyncSession):
  disable_resources: bool,
  ) -> PageInfo: # pragma: no cover
  """Get a new page to use"""
+ if TYPE_CHECKING:
+ assert self.context is not None, "Browser context not initialized"
+
  async with self._lock:
  # If we're at max capacity after cleanup, wait for busy pages to finish
  if self.page_pool.pages_count >= self.max_pages:
@@ -92,6 +97,7 @@
  f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
  )

+ assert self.context is not None, "Browser context not initialized"
  page = await self.context.new_page()
  page.set_default_navigation_timeout(timeout)
  page.set_default_timeout(timeout)
@@ -107,6 +113,14 @@

  return self.page_pool.add_page(page)

+ def get_pool_stats(self) -> Dict[str, int]:
+ """Get statistics about the current page pool"""
+ return {
+ "total_pages": self.page_pool.pages_count,
+ "busy_pages": self.page_pool.busy_count,
+ "max_pages": self.max_pages,
+ }
+

  class DynamicSessionMixin:
  def __validate__(self, **params):
@@ -134,11 +148,16 @@ DynamicSessionMixin
  self.init_script = config.init_script
  self.wait_selector_state = config.wait_selector_state
  self.selector_config = config.selector_config
+ self.additional_args = config.additional_args
  self.page_action = config.page_action
- self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
+ self.user_data_dir = config.user_data_dir
+ self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
  self.__initiate_browser_options__()

  def __initiate_browser_options__(self):
+ if TYPE_CHECKING:
+ assert isinstance(self.proxy, tuple)
+
  if not self.cdp_url:
  # `launch_options` is used with persistent context
  self.launch_options = dict(
@@ -156,6 +175,8 @@
  )
  self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
  self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
+ self.launch_options["user_data_dir"] = self.user_data_dir
+ self.launch_options.update(cast(Dict, self.additional_args))
  self.context_options = dict()
  else:
  # while `context_options` is left to be used when cdp mode is enabled
@@ -171,11 +192,12 @@
  )
  self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
  self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+ self.context_options.update(cast(Dict, self.additional_args))


  class StealthySessionMixin:
  def __validate__(self, **params):
- config = validate(params, model=CamoufoxConfig)
+ config: CamoufoxConfig = validate(params, model=CamoufoxConfig)

  self.max_pages = config.max_pages
  self.headless = config.headless
@@ -204,15 +226,16 @@
  self.selector_config = config.selector_config
  self.additional_args = config.additional_args
  self.page_action = config.page_action
- self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
+ self.user_data_dir = config.user_data_dir
+ self._headers_keys = {header.lower() for header in self.extra_headers.keys()} if self.extra_headers else set()
  self.__initiate_browser_options__()

  def __initiate_browser_options__(self):
  """Initiate browser options."""
- self.launch_options = generate_launch_options(
+ self.launch_options: Dict[str, Any] = generate_launch_options(
  **{
  "geoip": self.geoip,
- "proxy": dict(self.proxy) if self.proxy else self.proxy,
+ "proxy": dict(self.proxy) if self.proxy and isinstance(self.proxy, tuple) else self.proxy,
  "addons": self.addons,
  "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
  "headless": self.headless,
@@ -222,7 +245,7 @@
  "block_webrtc": self.block_webrtc,
  "block_images": self.block_images, # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
  "os": None if self.os_randomize else get_os_name(),
- "user_data_dir": "",
+ "user_data_dir": self.user_data_dir,
  "ff_version": __ff_version_str__,
  "firefox_user_prefs": {
  # This is what enabling `enable_cache` does internally, so we do it from here instead
@@ -232,7 +255,7 @@
  "browser.cache.disk_cache_ssl": True,
  "browser.cache.disk.smart_size.enabled": True,
  },
- **self.additional_args,
+ **cast(Dict, self.additional_args),
  }
  )