scrapling 0.3.4__tar.gz → 0.3.5__tar.gz

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (52)
  1. {scrapling-0.3.4/scrapling.egg-info → scrapling-0.3.5}/PKG-INFO +10 -10
  2. {scrapling-0.3.4 → scrapling-0.3.5}/README.md +5 -5
  3. {scrapling-0.3.4 → scrapling-0.3.5}/pyproject.toml +4 -4
  4. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/__init__.py +1 -1
  5. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/cli.py +4 -4
  6. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/custom_types.py +2 -2
  7. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/shell.py +4 -4
  8. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_base.py +2 -28
  9. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_camoufox.py +39 -38
  10. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_controllers.py +41 -50
  11. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_page.py +1 -42
  12. scrapling-0.3.5/scrapling/engines/_browsers/_validators.py +229 -0
  13. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/static.py +2 -4
  14. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/navigation.py +1 -1
  15. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/parser.py +3 -3
  16. {scrapling-0.3.4 → scrapling-0.3.5/scrapling.egg-info}/PKG-INFO +10 -10
  17. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/requires.txt +4 -4
  18. {scrapling-0.3.4 → scrapling-0.3.5}/setup.cfg +1 -1
  19. scrapling-0.3.4/scrapling/engines/_browsers/_validators.py +0 -164
  20. {scrapling-0.3.4 → scrapling-0.3.5}/LICENSE +0 -0
  21. {scrapling-0.3.4 → scrapling-0.3.5}/MANIFEST.in +0 -0
  22. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/__init__.py +0 -0
  23. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/_html_utils.py +0 -0
  24. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/_types.py +0 -0
  25. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/ai.py +0 -0
  26. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/mixins.py +0 -0
  27. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/storage.py +0 -0
  28. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/translator.py +0 -0
  29. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/__init__.py +0 -0
  30. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/_shell.py +0 -0
  31. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/core/utils/_utils.py +0 -0
  32. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/__init__.py +0 -0
  33. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/__init__.py +0 -0
  34. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/_browsers/_config_tools.py +0 -0
  35. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/constants.py +0 -0
  36. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/__init__.py +0 -0
  37. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  38. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  39. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  40. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  41. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  42. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  43. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/convertor.py +0 -0
  44. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/custom.py +0 -0
  45. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  46. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/fetchers.py +0 -0
  47. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling/py.typed +0 -0
  48. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/SOURCES.txt +0 -0
  49. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/dependency_links.txt +0 -0
  50. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/entry_points.txt +0 -0
  51. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/not-zip-safe +0 -0
  52. {scrapling-0.3.4 → scrapling-0.3.5}/scrapling.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scrapling
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
5
5
  Home-page: https://github.com/D4Vinci/Scrapling
6
6
  Author: Karim Shoair
@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
69
69
  Requires-Dist: orjson>=3.11.3
70
70
  Requires-Dist: tldextract>=5.3.0
71
71
  Provides-Extra: fetchers
72
- Requires-Dist: click>=8.2.1; extra == "fetchers"
72
+ Requires-Dist: click>=8.3.0; extra == "fetchers"
73
73
  Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
74
- Requires-Dist: playwright>=1.52.0; extra == "fetchers"
75
- Requires-Dist: rebrowser-playwright>=1.52.0; extra == "fetchers"
74
+ Requires-Dist: playwright>=1.55.0; extra == "fetchers"
75
+ Requires-Dist: patchright>=1.55.2; extra == "fetchers"
76
76
  Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
77
77
  Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
78
78
  Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
79
79
  Provides-Extra: ai
80
- Requires-Dist: mcp>=1.14.0; extra == "ai"
80
+ Requires-Dist: mcp>=1.14.1; extra == "ai"
81
81
  Requires-Dist: markdownify>=1.2.0; extra == "ai"
82
82
  Requires-Dist: scrapling[fetchers]; extra == "ai"
83
83
  Provides-Extra: shell
@@ -157,12 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
157
157
 
158
158
  <!-- sponsors -->
159
159
 
160
+ <a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
160
161
  <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
161
- <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
162
162
  <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
163
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
163
164
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
164
- <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
165
165
  <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
166
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
166
167
 
167
168
  <!-- /sponsors -->
168
169
 
@@ -411,10 +412,9 @@ This project includes code adapted from:
411
412
  ## Thanks and References
412
413
 
413
414
  - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
414
- - [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
415
+ - [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
415
416
  - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
416
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
417
- - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
417
+ - [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
418
418
 
419
419
  ---
420
420
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -67,12 +67,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
67
67
 
68
68
  <!-- sponsors -->
69
69
 
70
+ <a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
70
71
  <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
71
- <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
72
72
  <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
73
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
73
74
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
74
- <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
75
75
  <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
76
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
76
77
 
77
78
  <!-- /sponsors -->
78
79
 
@@ -321,10 +322,9 @@ This project includes code adapted from:
321
322
  ## Thanks and References
322
323
 
323
324
  - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
324
- - [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
325
+ - [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
325
326
  - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
326
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
327
- - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
327
+ - [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
328
328
 
329
329
  ---
330
330
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -64,16 +64,16 @@ dependencies = [
64
64
 
65
65
  [project.optional-dependencies]
66
66
  fetchers = [
67
- "click>=8.2.1",
67
+ "click>=8.3.0",
68
68
  "curl_cffi>=0.13.0",
69
- "playwright>=1.52.0",
70
- "rebrowser-playwright>=1.52.0",
69
+ "playwright>=1.55.0",
70
+ "patchright>=1.55.2",
71
71
  "camoufox>=0.4.11",
72
72
  "geoip2>=5.1.0",
73
73
  "msgspec>=0.19.0",
74
74
  ]
75
75
  ai = [
76
- "mcp>=1.14.0",
76
+ "mcp>=1.14.1",
77
77
  "markdownify>=1.2.0",
78
78
  "scrapling[fetchers]",
79
79
  ]
@@ -1,5 +1,5 @@
1
1
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
2
- __version__ = "0.3.4"
2
+ __version__ = "0.3.5"
3
3
  __copyright__ = "Copyright (c) 2024 Karim Shoair"
4
4
 
5
5
 
@@ -32,8 +32,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any
32
32
 
33
33
  try:
34
34
  return json_loads(json_string)
35
- except JSONDecodeError as e: # pragma: no cover
36
- raise ValueError(f"Invalid JSON data '{json_string}': {e}")
35
+ except JSONDecodeError as err: # pragma: no cover
36
+ raise ValueError(f"Invalid JSON data '{json_string}': {err}")
37
37
 
38
38
 
39
39
  def __Request_and_Save(
@@ -65,8 +65,8 @@ def __ParseExtractArguments(
65
65
  for key, value in _CookieParser(cookies):
66
66
  try:
67
67
  parsed_cookies[key] = value
68
- except Exception as e:
69
- raise ValueError(f"Could not parse cookies '{cookies}': {e}")
68
+ except Exception as err:
69
+ raise ValueError(f"Could not parse cookies '{cookies}': {err}")
70
70
 
71
71
  parsed_json = __ParseJSONData(json)
72
72
  parsed_params = {}
@@ -145,7 +145,7 @@ class TextHandler(str):
145
145
  clean_match: bool = False,
146
146
  case_sensitive: bool = True,
147
147
  check_match: Literal[False] = False,
148
- ) -> "TextHandlers[TextHandler]": ...
148
+ ) -> "TextHandlers": ...
149
149
 
150
150
  def re(
151
151
  self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
241
241
  replace_entities: bool = True,
242
242
  clean_match: bool = False,
243
243
  case_sensitive: bool = True,
244
- ) -> "TextHandlers[TextHandler]":
244
+ ) -> "TextHandlers":
245
245
  """Call the ``.re()`` method for each element in this list and return
246
246
  their results flattened as TextHandlers.
247
247
 
@@ -201,7 +201,7 @@ class CurlParser:
201
201
  data_payload = parsed_args.data_binary # Fallback to string
202
202
 
203
203
  elif parsed_args.data_raw is not None:
204
- data_payload = parsed_args.data_raw
204
+ data_payload = parsed_args.data_raw.lstrip("$")
205
205
 
206
206
  elif parsed_args.data is not None:
207
207
  data_payload = parsed_args.data
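
Context for the `lstrip("$")` change: curl commands copied from browser devtools often wrap the payload in Bash ANSI-C quoting (`$'...'`), and plain shell-style tokenization keeps that leading `$`. A small standalone sketch of the effect (the command string is a made-up example, not Scrapling's parser):

    import shlex

    # Hypothetical command as copied from devtools; note Bash's $'...' quoting.
    cmd = """curl 'https://example.com/api' --data-raw $'{"a": 1}'"""
    tokens = shlex.split(cmd)
    payload = tokens[tokens.index("--data-raw") + 1]
    print(payload)              # ${"a": 1}   -> the "$" survives tokenization
    print(payload.lstrip("$"))  # {"a": 1}    -> what is forwarded after this change
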
@@ -317,8 +317,8 @@ def show_page_in_browser(page: Selector): # pragma: no cover
317
317
 
318
318
  try:
319
319
  fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
320
- with open(fd, "wb") as f:
321
- f.write(page.body)
320
+ with open(fd, "w", encoding=page.encoding) as f:
321
+ f.write(page.html_content)
322
322
 
323
323
  open_in_browser(f"file://{fname}")
324
324
  except IOError as e:
@@ -545,7 +545,7 @@ class Convertor:
545
545
  for page in pages:
546
546
  match extraction_type:
547
547
  case "markdown":
548
- yield cls._convert_to_markdown(page.body)
548
+ yield cls._convert_to_markdown(page.html_content)
549
549
  case "html":
550
550
  yield page.body
551
551
  case "text":
@@ -1,4 +1,4 @@
1
- from time import time, sleep
1
+ from time import time
2
2
  from asyncio import sleep as asyncio_sleep, Lock
3
3
 
4
4
  from camoufox import DefaultAddons
@@ -44,23 +44,7 @@ class SyncSession:
44
44
  ) -> PageInfo: # pragma: no cover
45
45
  """Get a new page to use"""
46
46
 
47
- # Close all finished pages to ensure clean state
48
- self.page_pool.close_all_finished_pages()
49
-
50
- # If we're at max capacity after cleanup, wait for busy pages to finish
51
- if self.page_pool.pages_count >= self.max_pages:
52
- start_time = time()
53
- while time() - start_time < self._max_wait_for_page:
54
- # Wait for any pages to finish, then clean them up
55
- sleep(0.05)
56
- self.page_pool.close_all_finished_pages()
57
- if self.page_pool.pages_count < self.max_pages:
58
- break
59
- else:
60
- raise TimeoutError(
61
- f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
62
- )
63
-
47
+ # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
64
48
  page = self.context.new_page()
65
49
  page.set_default_navigation_timeout(timeout)
66
50
  page.set_default_timeout(timeout)
@@ -76,11 +60,6 @@ class SyncSession:
76
60
 
77
61
  return self.page_pool.add_page(page)
78
62
 
79
- @staticmethod
80
- def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
81
- """Get value with request-level priority over session-level"""
82
- return request_value if request_value is not sentinel_value else session_value
83
-
84
63
  def get_pool_stats(self) -> Dict[str, int]:
85
64
  """Get statistics about the current page pool"""
86
65
  return {
@@ -105,16 +84,11 @@ class AsyncSession(SyncSession):
105
84
  ) -> PageInfo: # pragma: no cover
106
85
  """Get a new page to use"""
107
86
  async with self._lock:
108
- # Close all finished pages to ensure clean state
109
- await self.page_pool.aclose_all_finished_pages()
110
-
111
87
  # If we're at max capacity after cleanup, wait for busy pages to finish
112
88
  if self.page_pool.pages_count >= self.max_pages:
113
89
  start_time = time()
114
90
  while time() - start_time < self._max_wait_for_page:
115
- # Wait for any pages to finish, then clean them up
116
91
  await asyncio_sleep(0.05)
117
- await self.page_pool.aclose_all_finished_pages()
118
92
  if self.page_pool.pages_count < self.max_pages:
119
93
  break
120
94
  else:
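
The async session keeps the bounded wait: when the pool is already at `max_pages`, it sleeps in 50 ms steps until a slot frees or the timeout elapses (the sync session no longer needs this because its calls block until the previous page is closed). A stripped-down sketch of that waiting pattern with illustrative names, not the library's API:

    import asyncio
    from time import time

    async def wait_for_free_slot(pool, max_pages: int, max_wait: float = 60.0) -> None:
        """Poll until the pool has capacity, or give up after max_wait seconds."""
        if pool.pages_count < max_pages:
            return
        start = time()
        while time() - start < max_wait:
            await asyncio.sleep(0.05)   # yield so busy pages can finish and be closed
            if pool.pages_count < max_pages:
                return
        raise TimeoutError(f"No page slot freed within {max_wait}s")
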
@@ -16,7 +16,7 @@ from playwright.async_api import (
16
16
  )
17
17
  from playwright._impl._errors import Error as PlaywrightError
18
18
 
19
- from ._validators import validate, CamoufoxConfig
19
+ from ._validators import validate_fetch as _validate
20
20
  from ._base import SyncSession, AsyncSession, StealthySessionMixin
21
21
  from scrapling.core.utils import log
22
22
  from scrapling.core._types import (
@@ -297,23 +297,22 @@ class StealthySession(StealthySessionMixin, SyncSession):
297
297
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
298
298
  :return: A `Response` object.
299
299
  """
300
- # Validate all resolved parameters
301
- params = validate(
302
- dict(
303
- google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
304
- timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
305
- wait=self._get_with_precedence(wait, self.wait, _UNSET),
306
- page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
307
- extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
308
- disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
309
- wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
310
- wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
311
- network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
312
- load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
313
- solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
314
- selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
315
- ),
316
- CamoufoxConfig,
300
+ params = _validate(
301
+ [
302
+ ("google_search", google_search, self.google_search),
303
+ ("timeout", timeout, self.timeout),
304
+ ("wait", wait, self.wait),
305
+ ("page_action", page_action, self.page_action),
306
+ ("extra_headers", extra_headers, self.extra_headers),
307
+ ("disable_resources", disable_resources, self.disable_resources),
308
+ ("wait_selector", wait_selector, self.wait_selector),
309
+ ("wait_selector_state", wait_selector_state, self.wait_selector_state),
310
+ ("network_idle", network_idle, self.network_idle),
311
+ ("load_dom", load_dom, self.load_dom),
312
+ ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
313
+ ("selector_config", selector_config, self.selector_config),
314
+ ],
315
+ _UNSET,
317
316
  )
318
317
 
319
318
  if self._closed: # pragma: no cover
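
Both the sync and async `fetch` now hand the validator a list of `(name, request_value, session_value)` tuples plus the `_UNSET` sentinel, so only arguments the caller actually passed get re-validated while everything else falls back to the session defaults. A condensed, self-contained sketch of that precedence rule (not the library's internals):

    _UNSET = object()  # sentinel meaning "the caller did not pass this argument"

    def resolve(params):
        """params is an iterable of (name, request_value, session_value) tuples."""
        overrides = {name: req for name, req, _ in params if req is not _UNSET}
        defaults = {name: sess for name, req, sess in params if req is _UNSET}
        # Only `overrides` would need validation; session values were checked at session start.
        return {**defaults, **overrides}

    print(resolve([("timeout", _UNSET, 30000), ("wait", 5, 0)]))
    # {'timeout': 30000, 'wait': 5}
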
@@ -381,8 +380,9 @@ class StealthySession(StealthySessionMixin, SyncSession):
381
380
  page_info.page, first_response, final_response, params.selector_config
382
381
  )
383
382
 
384
- # Mark the page as finished for next use
385
- page_info.mark_finished()
383
+ # Close the page, to free up resources
384
+ page_info.page.close()
385
+ self.page_pool.pages.remove(page_info)
386
386
 
387
387
  return response
388
388
 
@@ -616,22 +616,22 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
616
616
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
617
617
  :return: A `Response` object.
618
618
  """
619
- params = validate(
620
- dict(
621
- google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
622
- timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
623
- wait=self._get_with_precedence(wait, self.wait, _UNSET),
624
- page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
625
- extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
626
- disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
627
- wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
628
- wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
629
- network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
630
- load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
631
- solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
632
- selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
633
- ),
634
- CamoufoxConfig,
619
+ params = _validate(
620
+ [
621
+ ("google_search", google_search, self.google_search),
622
+ ("timeout", timeout, self.timeout),
623
+ ("wait", wait, self.wait),
624
+ ("page_action", page_action, self.page_action),
625
+ ("extra_headers", extra_headers, self.extra_headers),
626
+ ("disable_resources", disable_resources, self.disable_resources),
627
+ ("wait_selector", wait_selector, self.wait_selector),
628
+ ("wait_selector_state", wait_selector_state, self.wait_selector_state),
629
+ ("network_idle", network_idle, self.network_idle),
630
+ ("load_dom", load_dom, self.load_dom),
631
+ ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
632
+ ("selector_config", selector_config, self.selector_config),
633
+ ],
634
+ _UNSET,
635
635
  )
636
636
 
637
637
  if self._closed: # pragma: no cover
@@ -701,8 +701,9 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
701
701
  page_info.page, first_response, final_response, params.selector_config
702
702
  )
703
703
 
704
- # Mark the page as finished for next use
705
- page_info.mark_finished()
704
+ # Close the page, to free up resources
705
+ await page_info.page.close()
706
+ self.page_pool.pages.remove(page_info)
706
707
 
707
708
  return response
708
709
 
@@ -11,14 +11,12 @@ from playwright.async_api import (
11
11
  Playwright as AsyncPlaywright,
12
12
  Locator as AsyncLocator,
13
13
  )
14
- from rebrowser_playwright.sync_api import sync_playwright as sync_rebrowser_playwright
15
- from rebrowser_playwright.async_api import (
16
- async_playwright as async_rebrowser_playwright,
17
- )
14
+ from patchright.sync_api import sync_playwright as sync_patchright
15
+ from patchright.async_api import async_playwright as async_patchright
18
16
 
19
17
  from scrapling.core.utils import log
20
18
  from ._base import SyncSession, AsyncSession, DynamicSessionMixin
21
- from ._validators import validate, PlaywrightConfig
19
+ from ._validators import validate_fetch as _validate
22
20
  from scrapling.core._types import (
23
21
  Dict,
24
22
  List,
@@ -154,10 +152,7 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
154
152
 
155
153
  def __create__(self):
156
154
  """Create a browser for this instance and context."""
157
- sync_context = sync_rebrowser_playwright
158
- if not self.stealth or self.real_chrome:
159
- # Because rebrowser_playwright doesn't play well with real browsers
160
- sync_context = sync_playwright
155
+ sync_context = sync_patchright if self.stealth else sync_playwright
161
156
 
162
157
  self.playwright: Playwright = sync_context().start()
163
158
 
@@ -229,22 +224,21 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
229
224
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
230
225
  :return: A `Response` object.
231
226
  """
232
- # Validate all resolved parameters
233
- params = validate(
234
- dict(
235
- google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
236
- timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
237
- wait=self._get_with_precedence(wait, self.wait, _UNSET),
238
- page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
239
- extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
240
- disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
241
- wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
242
- wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
243
- network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
244
- load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
245
- selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
246
- ),
247
- PlaywrightConfig,
227
+ params = _validate(
228
+ [
229
+ ("google_search", google_search, self.google_search),
230
+ ("timeout", timeout, self.timeout),
231
+ ("wait", wait, self.wait),
232
+ ("page_action", page_action, self.page_action),
233
+ ("extra_headers", extra_headers, self.extra_headers),
234
+ ("disable_resources", disable_resources, self.disable_resources),
235
+ ("wait_selector", wait_selector, self.wait_selector),
236
+ ("wait_selector_state", wait_selector_state, self.wait_selector_state),
237
+ ("network_idle", network_idle, self.network_idle),
238
+ ("load_dom", load_dom, self.load_dom),
239
+ ("selector_config", selector_config, self.selector_config),
240
+ ],
241
+ _UNSET,
248
242
  )
249
243
 
250
244
  if self._closed: # pragma: no cover
@@ -305,8 +299,9 @@ class DynamicSession(DynamicSessionMixin, SyncSession):
305
299
  page_info.page, first_response, final_response, params.selector_config
306
300
  )
307
301
 
308
- # Mark the page as finished for next use
309
- page_info.mark_finished()
302
+ # Close the page, to free up resources
303
+ page_info.page.close()
304
+ self.page_pool.pages.remove(page_info)
310
305
 
311
306
  return response
312
307
 
@@ -402,10 +397,7 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
402
397
 
403
398
  async def __create__(self):
404
399
  """Create a browser for this instance and context."""
405
- async_context = async_rebrowser_playwright
406
- if not self.stealth or self.real_chrome:
407
- # Because rebrowser_playwright doesn't play well with real browsers
408
- async_context = async_playwright
400
+ async_context = async_patchright if self.stealth else async_playwright
409
401
 
410
402
  self.playwright: AsyncPlaywright = await async_context().start()
411
403
 
@@ -478,22 +470,21 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
478
470
  :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
479
471
  :return: A `Response` object.
480
472
  """
481
- # Validate all resolved parameters
482
- params = validate(
483
- dict(
484
- google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
485
- timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
486
- wait=self._get_with_precedence(wait, self.wait, _UNSET),
487
- page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
488
- extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
489
- disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
490
- wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
491
- wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
492
- network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
493
- load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
494
- selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
495
- ),
496
- PlaywrightConfig,
473
+ params = _validate(
474
+ [
475
+ ("google_search", google_search, self.google_search),
476
+ ("timeout", timeout, self.timeout),
477
+ ("wait", wait, self.wait),
478
+ ("page_action", page_action, self.page_action),
479
+ ("extra_headers", extra_headers, self.extra_headers),
480
+ ("disable_resources", disable_resources, self.disable_resources),
481
+ ("wait_selector", wait_selector, self.wait_selector),
482
+ ("wait_selector_state", wait_selector_state, self.wait_selector_state),
483
+ ("network_idle", network_idle, self.network_idle),
484
+ ("load_dom", load_dom, self.load_dom),
485
+ ("selector_config", selector_config, self.selector_config),
486
+ ],
487
+ _UNSET,
497
488
  )
498
489
 
499
490
  if self._closed: # pragma: no cover
@@ -554,9 +545,9 @@ class AsyncDynamicSession(DynamicSessionMixin, AsyncSession):
554
545
  page_info.page, first_response, final_response, params.selector_config
555
546
  )
556
547
 
557
- # Mark the page as finished for next use
558
- page_info.mark_finished()
559
-
548
+ # Close the page, to free up resources
549
+ await page_info.page.close()
550
+ self.page_pool.pages.remove(page_info)
560
551
  return response
561
552
 
562
553
  except Exception as e: # pragma: no cover
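
Patchright mirrors Playwright's module layout and API, which is why the only decision left in `__create__` is which factory to start (`patchright` for stealth, upstream `playwright` otherwise; the old `real_chrome` carve-out for rebrowser-playwright is gone). A hedged sketch of that drop-in pattern, assuming both packages are installed:

    def start_sync_driver(stealth: bool):
        """Pick the engine at runtime; both expose the same sync Playwright API."""
        if stealth:
            from patchright.sync_api import sync_playwright  # patched build, harder to fingerprint
        else:
            from playwright.sync_api import sync_playwright  # upstream Playwright
        return sync_playwright().start()

    driver = start_sync_driver(stealth=True)
    browser = driver.chromium.launch(headless=True)
    browser.close()
    driver.stop()
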
@@ -6,7 +6,7 @@ from playwright.async_api import Page as AsyncPage
6
6
 
7
7
  from scrapling.core._types import Optional, List, Literal
8
8
 
9
- PageState = Literal["finished", "ready", "busy", "error"] # States that a page can be in
9
+ PageState = Literal["ready", "busy", "error"] # States that a page can be in
10
10
 
11
11
 
12
12
  @dataclass
@@ -23,11 +23,6 @@ class PageInfo:
23
23
  self.state = "busy"
24
24
  self.url = url
25
25
 
26
- def mark_finished(self):
27
- """Mark the page as finished for new requests"""
28
- self.state = "finished"
29
- self.url = ""
30
-
31
26
  def mark_error(self):
32
27
  """Mark the page as having an error"""
33
28
  self.state = "error"
@@ -67,12 +62,6 @@ class PagePool:
67
62
  """Get the total number of pages"""
68
63
  return len(self.pages)
69
64
 
70
- @property
71
- def finished_count(self) -> int:
72
- """Get the number of finished pages"""
73
- with self._lock:
74
- return sum(1 for p in self.pages if p.state == "finished")
75
-
76
65
  @property
77
66
  def busy_count(self) -> int:
78
67
  """Get the number of busy pages"""
@@ -83,33 +72,3 @@ class PagePool:
83
72
  """Remove pages in error state"""
84
73
  with self._lock:
85
74
  self.pages = [p for p in self.pages if p.state != "error"]
86
-
87
- def close_all_finished_pages(self):
88
- """Close all pages in finished state and remove them from the pool"""
89
- with self._lock:
90
- pages_to_remove = []
91
- for page_info in self.pages:
92
- if page_info.state == "finished":
93
- try:
94
- page_info.page.close()
95
- except Exception:
96
- pass
97
- pages_to_remove.append(page_info)
98
-
99
- for page_info in pages_to_remove:
100
- self.pages.remove(page_info)
101
-
102
- async def aclose_all_finished_pages(self):
103
- """Async version: Close all pages in finished state and remove them from the pool"""
104
- with self._lock:
105
- pages_to_remove = []
106
- for page_info in self.pages:
107
- if page_info.state == "finished":
108
- try:
109
- await page_info.page.close()
110
- except Exception:
111
- pass
112
- pages_to_remove.append(page_info)
113
-
114
- for page_info in pages_to_remove:
115
- self.pages.remove(page_info)
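
With the "finished" state and its reaper methods removed, a pooled page now moves through just ready → busy and is then either closed and dropped or marked as an error; nothing is recycled between requests. A minimal sketch of the reduced lifecycle (illustrative names, not Scrapling's classes):

    from dataclasses import dataclass
    from typing import Literal

    PageState = Literal["ready", "busy", "error"]

    @dataclass
    class PoolEntry:
        page: object                 # the underlying Playwright page handle
        state: PageState = "ready"
        url: str = ""

    def finish(pool: list, entry: PoolEntry) -> None:
        """0.3.5 behaviour: close the page and forget it instead of recycling it."""
        entry.page.close()
        pool.remove(entry)

    class _DummyPage:
        def close(self) -> None:
            print("page closed")

    pool = [PoolEntry(page=_DummyPage(), state="busy", url="https://example.com")]
    finish(pool, pool[0])            # -> "page closed"; the pool is now empty
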
@@ -0,0 +1,229 @@
1
+ from pathlib import Path
2
+ from typing import Annotated
3
+ from dataclasses import dataclass
4
+ from urllib.parse import urlparse
5
+
6
+ from msgspec import Struct, Meta, convert, ValidationError
7
+
8
+ from scrapling.core._types import (
9
+ Dict,
10
+ List,
11
+ Tuple,
12
+ Optional,
13
+ Callable,
14
+ SelectorWaitStates,
15
+ )
16
+ from scrapling.engines.toolbelt.navigation import construct_proxy_dict
17
+
18
+
19
+ # Custom validators for msgspec
20
+ def _validate_file_path(value: str):
21
+ """Fast file path validation"""
22
+ path = Path(value)
23
+ if not path.exists():
24
+ raise ValueError(f"Init script path not found: {value}")
25
+ if not path.is_file():
26
+ raise ValueError(f"Init script is not a file: {value}")
27
+ if not path.is_absolute():
28
+ raise ValueError(f"Init script is not a absolute path: {value}")
29
+
30
+
31
+ def _validate_addon_path(value: str):
32
+ """Fast addon path validation"""
33
+ path = Path(value)
34
+ if not path.exists():
35
+ raise FileNotFoundError(f"Addon path not found: {value}")
36
+ if not path.is_dir():
37
+ raise ValueError(f"Addon path must be a directory of the extracted addon: {value}")
38
+
39
+
40
+ def _validate_cdp_url(cdp_url: str):
41
+ """Fast CDP URL validation"""
42
+ try:
43
+ # Check the scheme
44
+ if not cdp_url.startswith(("ws://", "wss://")):
45
+ raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
46
+
47
+ # Validate hostname and port
48
+ if not urlparse(cdp_url).netloc:
49
+ raise ValueError("Invalid hostname for the CDP URL")
50
+
51
+ except AttributeError as e:
52
+ raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
53
+
54
+ except Exception as e:
55
+ raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
56
+
57
+
58
+ # Type aliases for cleaner annotations
59
+ PagesCount = Annotated[int, Meta(ge=1, le=50)]
60
+ Seconds = Annotated[int, float, Meta(ge=0)]
61
+
62
+
63
+ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
64
+ """Configuration struct for validation"""
65
+
66
+ max_pages: PagesCount = 1
67
+ cdp_url: Optional[str] = None
68
+ headless: bool = True
69
+ google_search: bool = True
70
+ hide_canvas: bool = False
71
+ disable_webgl: bool = False
72
+ real_chrome: bool = False
73
+ stealth: bool = False
74
+ wait: Seconds = 0
75
+ page_action: Optional[Callable] = None
76
+ proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
77
+ locale: str = "en-US"
78
+ extra_headers: Optional[Dict[str, str]] = None
79
+ useragent: Optional[str] = None
80
+ timeout: Seconds = 30000
81
+ init_script: Optional[str] = None
82
+ disable_resources: bool = False
83
+ wait_selector: Optional[str] = None
84
+ cookies: Optional[List[Dict]] = None
85
+ network_idle: bool = False
86
+ load_dom: bool = True
87
+ wait_selector_state: SelectorWaitStates = "attached"
88
+ selector_config: Optional[Dict] = None
89
+
90
+ def __post_init__(self):
91
+ """Custom validation after msgspec validation"""
92
+ if self.page_action and not callable(self.page_action):
93
+ raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
94
+ if self.proxy:
95
+ self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
96
+ if self.cdp_url:
97
+ _validate_cdp_url(self.cdp_url)
98
+
99
+ if not self.cookies:
100
+ self.cookies = []
101
+ if not self.selector_config:
102
+ self.selector_config = {}
103
+
104
+ if self.init_script is not None:
105
+ _validate_file_path(self.init_script)
106
+
107
+
108
+ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
109
+ """Configuration struct for validation"""
110
+
111
+ max_pages: PagesCount = 1
112
+ headless: bool = True # noqa: F821
113
+ block_images: bool = False
114
+ disable_resources: bool = False
115
+ block_webrtc: bool = False
116
+ allow_webgl: bool = True
117
+ network_idle: bool = False
118
+ load_dom: bool = True
119
+ humanize: bool | float = True
120
+ solve_cloudflare: bool = False
121
+ wait: Seconds = 0
122
+ timeout: Seconds = 30000
123
+ init_script: Optional[str] = None
124
+ page_action: Optional[Callable] = None
125
+ wait_selector: Optional[str] = None
126
+ addons: Optional[List[str]] = None
127
+ wait_selector_state: SelectorWaitStates = "attached"
128
+ cookies: Optional[List[Dict]] = None
129
+ google_search: bool = True
130
+ extra_headers: Optional[Dict[str, str]] = None
131
+ proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
132
+ os_randomize: bool = False
133
+ disable_ads: bool = False
134
+ geoip: bool = False
135
+ selector_config: Optional[Dict] = None
136
+ additional_args: Optional[Dict] = None
137
+
138
+ def __post_init__(self):
139
+ """Custom validation after msgspec validation"""
140
+ if self.page_action and not callable(self.page_action):
141
+ raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
142
+ if self.proxy:
143
+ self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
144
+
145
+ if self.addons and isinstance(self.addons, list):
146
+ for addon in self.addons:
147
+ _validate_addon_path(addon)
148
+ else:
149
+ self.addons = []
150
+
151
+ if self.init_script is not None:
152
+ _validate_file_path(self.init_script)
153
+
154
+ if not self.cookies:
155
+ self.cookies = []
156
+ # Cloudflare timeout adjustment
157
+ if self.solve_cloudflare and self.timeout < 60_000:
158
+ self.timeout = 60_000
159
+ if not self.selector_config:
160
+ self.selector_config = {}
161
+ if not self.additional_args:
162
+ self.additional_args = {}
163
+
164
+
165
+ # Code parts to validate `fetch` in the least possible numbers of lines overall
166
+ class FetchConfig(Struct, kw_only=True):
167
+ """Configuration struct for `fetch` calls validation"""
168
+
169
+ google_search: bool = True
170
+ timeout: Seconds = 30000
171
+ wait: Seconds = 0
172
+ page_action: Optional[Callable] = None
173
+ extra_headers: Optional[Dict[str, str]] = None
174
+ disable_resources: bool = False
175
+ wait_selector: Optional[str] = None
176
+ wait_selector_state: SelectorWaitStates = "attached"
177
+ network_idle: bool = False
178
+ load_dom: bool = True
179
+ solve_cloudflare: bool = False
180
+ selector_config: Optional[Dict] = {}
181
+
182
+ def to_dict(self):
183
+ return {f: getattr(self, f) for f in self.__struct_fields__}
184
+
185
+
186
+ @dataclass
187
+ class _fetch_params:
188
+ """A dataclass of all parameters used by `fetch` calls"""
189
+
190
+ google_search: bool
191
+ timeout: Seconds
192
+ wait: Seconds
193
+ page_action: Optional[Callable]
194
+ extra_headers: Optional[Dict[str, str]]
195
+ disable_resources: bool
196
+ wait_selector: Optional[str]
197
+ wait_selector_state: SelectorWaitStates
198
+ network_idle: bool
199
+ load_dom: bool
200
+ solve_cloudflare: bool
201
+ selector_config: Optional[Dict]
202
+
203
+
204
+ def validate_fetch(params: List[Tuple], sentinel=None) -> _fetch_params:
205
+ result = {}
206
+ overrides = {}
207
+
208
+ for arg, request_value, session_value in params:
209
+ if request_value is not sentinel:
210
+ overrides[arg] = request_value
211
+ else:
212
+ result[arg] = session_value
213
+
214
+ if overrides:
215
+ overrides = validate(overrides, FetchConfig).to_dict()
216
+ overrides.update(result)
217
+ return _fetch_params(**overrides)
218
+
219
+ if not result.get("solve_cloudflare"):
220
+ result["solve_cloudflare"] = False
221
+
222
+ return _fetch_params(**result)
223
+
224
+
225
+ def validate(params: Dict, model) -> PlaywrightConfig | CamoufoxConfig | FetchConfig:
226
+ try:
227
+ return convert(params, model)
228
+ except ValidationError as e:
229
+ raise TypeError(f"Invalid argument type: {e}") from e
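
The hand-written range checks from the old `__post_init__` methods are now expressed as `Annotated` metadata, so `msgspec.convert` rejects out-of-range values before the struct is even built. A small standalone demonstration of that mechanism (a toy struct, not Scrapling's):

    from typing import Annotated
    from msgspec import Struct, Meta, convert, ValidationError

    class Demo(Struct, kw_only=True):
        max_pages: Annotated[int, Meta(ge=1, le=50)] = 1
        timeout: Annotated[float, Meta(ge=0)] = 30000.0

    print(convert({"max_pages": 3}, Demo))   # Demo(max_pages=3, timeout=30000.0)
    try:
        convert({"max_pages": 99}, Demo)     # violates le=50
    except ValidationError as err:
        print(err)                           # reports the constraint and the offending field
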
@@ -94,8 +94,8 @@ class FetcherSession:
94
94
  self.default_http3 = http3
95
95
  self.selector_config = selector_config or {}
96
96
 
97
- self._curl_session: Optional[CurlSession] = None
98
- self._async_curl_session: Optional[AsyncCurlSession] = None
97
+ self._curl_session: Optional[CurlSession] | bool = None
98
+ self._async_curl_session: Optional[AsyncCurlSession] | bool = None
99
99
 
100
100
  def _merge_request_args(self, **kwargs) -> Dict[str, Any]:
101
101
  """Merge request-specific arguments with default session arguments."""
@@ -239,7 +239,6 @@ class FetcherSession:
239
239
  Perform an HTTP request using the configured session.
240
240
 
241
241
  :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
242
- :param url: Target URL for the request.
243
242
  :param request_args: Arguments to be passed to the session's `request()` method.
244
243
  :param max_retries: Maximum number of retries for the request.
245
244
  :param retry_delay: Number of seconds to wait between retries.
@@ -280,7 +279,6 @@ class FetcherSession:
280
279
  Perform an HTTP request using the configured session.
281
280
 
282
281
  :param method: HTTP method to be used, supported methods are ["GET", "POST", "PUT", "DELETE"]
283
- :param url: Target URL for the request.
284
282
  :param request_args: Arguments to be passed to the session's `request()` method.
285
283
  :param max_retries: Maximum number of retries for the request.
286
284
  :param retry_delay: Number of seconds to wait between retries.
@@ -4,7 +4,7 @@ Functions related to files and URLs
4
4
 
5
5
  from pathlib import Path
6
6
  from functools import lru_cache
7
- from urllib.parse import urlencode, urlparse
7
+ from urllib.parse import urlparse
8
8
 
9
9
  from playwright.async_api import Route as async_Route
10
10
  from msgspec import Struct, structs, convert, ValidationError
@@ -239,7 +239,7 @@ class Selector(SelectorsGeneration):
239
239
  )
240
240
 
241
241
  def __handle_element(
242
- self, element: HtmlElement | _ElementUnicodeResult
242
+ self, element: Optional[HtmlElement | _ElementUnicodeResult]
243
243
  ) -> Optional[Union[TextHandler, "Selector"]]:
244
244
  """Used internally in all functions to convert a single element to type (Selector|TextHandler) when possible"""
245
245
  if element is None:
@@ -345,7 +345,7 @@ class Selector(SelectorsGeneration):
345
345
  return TextHandler(content)
346
346
 
347
347
  @property
348
- def body(self):
348
+ def body(self) -> str | bytes:
349
349
  """Return the raw body of the current `Selector` without any processing. Useful for binary and non-HTML requests."""
350
350
  return self._raw_body
351
351
 
@@ -1259,7 +1259,7 @@ class Selectors(List[Selector]):
1259
1259
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
1260
1260
  :param case_sensitive: if disabled, the function will set the regex to ignore the letters case while compiling it
1261
1261
  """
1262
- results = [n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
1262
+ results = [n.re(regex, replace_entities, clean_match, case_sensitive) for n in self]
1263
1263
  return TextHandlers(flatten(results))
1264
1264
 
1265
1265
  def re_first(
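
The fix routes each element through its own `.re()` (instead of reaching into `.text` first) and then flattens the per-element match lists into one `TextHandlers`. The flattening step is the simple part; a sketch assuming a `flatten` helper along the lines of Scrapling's utility:

    from itertools import chain

    def flatten(list_of_lists):
        """Collapse one level of nesting into a single flat list."""
        return list(chain.from_iterable(list_of_lists))

    per_element_matches = [["29.99"], [], ["15.00", "12.50"]]   # one result list per element
    print(flatten(per_element_matches))                         # ['29.99', '15.00', '12.50']
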
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: scrapling
3
- Version: 0.3.4
3
+ Version: 0.3.5
4
4
  Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
5
5
  Home-page: https://github.com/D4Vinci/Scrapling
6
6
  Author: Karim Shoair
@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
69
69
  Requires-Dist: orjson>=3.11.3
70
70
  Requires-Dist: tldextract>=5.3.0
71
71
  Provides-Extra: fetchers
72
- Requires-Dist: click>=8.2.1; extra == "fetchers"
72
+ Requires-Dist: click>=8.3.0; extra == "fetchers"
73
73
  Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
74
- Requires-Dist: playwright>=1.52.0; extra == "fetchers"
75
- Requires-Dist: rebrowser-playwright>=1.52.0; extra == "fetchers"
74
+ Requires-Dist: playwright>=1.55.0; extra == "fetchers"
75
+ Requires-Dist: patchright>=1.55.2; extra == "fetchers"
76
76
  Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
77
77
  Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
78
78
  Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
79
79
  Provides-Extra: ai
80
- Requires-Dist: mcp>=1.14.0; extra == "ai"
80
+ Requires-Dist: mcp>=1.14.1; extra == "ai"
81
81
  Requires-Dist: markdownify>=1.2.0; extra == "ai"
82
82
  Requires-Dist: scrapling[fetchers]; extra == "ai"
83
83
  Provides-Extra: shell
@@ -157,12 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
157
157
 
158
158
  <!-- sponsors -->
159
159
 
160
+ <a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
160
161
  <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
161
- <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
162
162
  <a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
163
+ <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
163
164
  <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
164
- <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
165
165
  <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
166
+ <a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
166
167
 
167
168
  <!-- /sponsors -->
168
169
 
@@ -411,10 +412,9 @@ This project includes code adapted from:
411
412
  ## Thanks and References
412
413
 
413
414
  - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
414
- - [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
415
+ - [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
415
416
  - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
416
- - [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
417
- - [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
417
+ - [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
418
418
 
419
419
  ---
420
420
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
@@ -4,7 +4,7 @@ orjson>=3.11.3
4
4
  tldextract>=5.3.0
5
5
 
6
6
  [ai]
7
- mcp>=1.14.0
7
+ mcp>=1.14.1
8
8
  markdownify>=1.2.0
9
9
  scrapling[fetchers]
10
10
 
@@ -12,10 +12,10 @@ scrapling[fetchers]
12
12
  scrapling[ai,shell]
13
13
 
14
14
  [fetchers]
15
- click>=8.2.1
15
+ click>=8.3.0
16
16
  curl_cffi>=0.13.0
17
- playwright>=1.52.0
18
- rebrowser-playwright>=1.52.0
17
+ playwright>=1.55.0
18
+ patchright>=1.55.2
19
19
  camoufox>=0.4.11
20
20
  geoip2>=5.1.0
21
21
  msgspec>=0.19.0
@@ -1,6 +1,6 @@
1
1
  [metadata]
2
2
  name = scrapling
3
- version = 0.3.4
3
+ version = 0.3.5
4
4
  author = Karim Shoair
5
5
  author_email = karim.shoair@pm.me
6
6
  description = Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
@@ -1,164 +0,0 @@
1
- from msgspec import Struct, convert, ValidationError
2
- from urllib.parse import urlparse
3
- from pathlib import Path
4
-
5
- from scrapling.core._types import (
6
- Optional,
7
- Dict,
8
- Callable,
9
- List,
10
- SelectorWaitStates,
11
- )
12
- from scrapling.engines.toolbelt.navigation import construct_proxy_dict
13
-
14
-
15
- class PlaywrightConfig(Struct, kw_only=True, frozen=False):
16
- """Configuration struct for validation"""
17
-
18
- max_pages: int = 1
19
- cdp_url: Optional[str] = None
20
- headless: bool = True
21
- google_search: bool = True
22
- hide_canvas: bool = False
23
- disable_webgl: bool = False
24
- real_chrome: bool = False
25
- stealth: bool = False
26
- wait: int | float = 0
27
- page_action: Optional[Callable] = None
28
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
29
- locale: str = "en-US"
30
- extra_headers: Optional[Dict[str, str]] = None
31
- useragent: Optional[str] = None
32
- timeout: int | float = 30000
33
- init_script: Optional[str] = None
34
- disable_resources: bool = False
35
- wait_selector: Optional[str] = None
36
- cookies: Optional[List[Dict]] = None
37
- network_idle: bool = False
38
- load_dom: bool = True
39
- wait_selector_state: SelectorWaitStates = "attached"
40
- selector_config: Optional[Dict] = None
41
-
42
- def __post_init__(self):
43
- """Custom validation after msgspec validation"""
44
- if self.max_pages < 1 or self.max_pages > 50:
45
- raise ValueError("max_pages must be between 1 and 50")
46
- if self.timeout < 0:
47
- raise ValueError("timeout must be >= 0")
48
- if self.page_action and not callable(self.page_action):
49
- raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
50
- if self.proxy:
51
- self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
52
- if self.cdp_url:
53
- self.__validate_cdp(self.cdp_url)
54
- if not self.cookies:
55
- self.cookies = []
56
- if not self.selector_config:
57
- self.selector_config = {}
58
-
59
- if self.init_script is not None:
60
- script_path = Path(self.init_script)
61
- if not script_path.exists():
62
- raise ValueError("Init script path not found")
63
- elif not script_path.is_file():
64
- raise ValueError("Init script is not a file")
65
- elif not script_path.is_absolute():
66
- raise ValueError("Init script is not a absolute path")
67
-
68
- @staticmethod
69
- def __validate_cdp(cdp_url):
70
- try:
71
- # Check the scheme
72
- if not cdp_url.startswith(("ws://", "wss://")):
73
- raise ValueError("CDP URL must use 'ws://' or 'wss://' scheme")
74
-
75
- # Validate hostname and port
76
- if not urlparse(cdp_url).netloc:
77
- raise ValueError("Invalid hostname for the CDP URL")
78
-
79
- except AttributeError as e:
80
- raise ValueError(f"Malformed CDP URL: {cdp_url}: {str(e)}")
81
-
82
- except Exception as e:
83
- raise ValueError(f"Invalid CDP URL '{cdp_url}': {str(e)}")
84
-
85
-
86
- class CamoufoxConfig(Struct, kw_only=True, frozen=False):
87
- """Configuration struct for validation"""
88
-
89
- max_pages: int = 1
90
- headless: bool = True # noqa: F821
91
- block_images: bool = False
92
- disable_resources: bool = False
93
- block_webrtc: bool = False
94
- allow_webgl: bool = True
95
- network_idle: bool = False
96
- load_dom: bool = True
97
- humanize: bool | float = True
98
- solve_cloudflare: bool = False
99
- wait: int | float = 0
100
- timeout: int | float = 30000
101
- init_script: Optional[str] = None
102
- page_action: Optional[Callable] = None
103
- wait_selector: Optional[str] = None
104
- addons: Optional[List[str]] = None
105
- wait_selector_state: SelectorWaitStates = "attached"
106
- cookies: Optional[List[Dict]] = None
107
- google_search: bool = True
108
- extra_headers: Optional[Dict[str, str]] = None
109
- proxy: Optional[str | Dict[str, str]] = None # The default value for proxy in Playwright's source is `None`
110
- os_randomize: bool = False
111
- disable_ads: bool = False
112
- geoip: bool = False
113
- selector_config: Optional[Dict] = None
114
- additional_args: Optional[Dict] = None
115
-
116
- def __post_init__(self):
117
- """Custom validation after msgspec validation"""
118
- if self.max_pages < 1 or self.max_pages > 50:
119
- raise ValueError("max_pages must be between 1 and 50")
120
- if self.timeout < 0:
121
- raise ValueError("timeout must be >= 0")
122
- if self.page_action and not callable(self.page_action):
123
- raise TypeError(f"page_action must be callable, got {type(self.page_action).__name__}")
124
- if self.proxy:
125
- self.proxy = construct_proxy_dict(self.proxy, as_tuple=True)
126
-
127
- if not self.addons:
128
- self.addons = []
129
- else:
130
- for addon in self.addons:
131
- addon_path = Path(addon)
132
- if not addon_path.exists():
133
- raise FileNotFoundError(f"Addon's path not found: {addon}")
134
- elif not addon_path.is_dir():
135
- raise ValueError(
136
- f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
137
- )
138
-
139
- if self.init_script is not None:
140
- script_path = Path(self.init_script)
141
- if not script_path.exists():
142
- raise ValueError("Init script path not found")
143
- elif not script_path.is_file():
144
- raise ValueError("Init script is not a file")
145
- elif not script_path.is_absolute():
146
- raise ValueError("Init script is not a absolute path")
147
-
148
- if not self.cookies:
149
- self.cookies = []
150
- if self.solve_cloudflare and self.timeout < 60_000:
151
- self.timeout = 60_000
152
- if not self.selector_config:
153
- self.selector_config = {}
154
- if not self.additional_args:
155
- self.additional_args = {}
156
-
157
-
158
- def validate(params, model):
159
- try:
160
- config = convert(params, model)
161
- except ValidationError as e:
162
- raise TypeError(f"Invalid argument type: {e}")
163
-
164
- return config