scrapling 0.3.3__tar.gz → 0.3.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52)
  1. {scrapling-0.3.3/scrapling.egg-info → scrapling-0.3.5}/PKG-INFO +18 -17
  2. {scrapling-0.3.3 → scrapling-0.3.5}/README.md +13 -12
  3. {scrapling-0.3.3 → scrapling-0.3.5}/pyproject.toml +4 -4
  4. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/__init__.py +1 -1
  5. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/cli.py +4 -4
  6. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/custom_types.py +2 -2
  7. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/shell.py +21 -6
  8. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_base.py +5 -31
  9. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_camoufox.py +74 -44
  10. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_controllers.py +41 -50
  11. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_page.py +1 -42
  12. scrapling-0.3.5/scrapling/engines/_browsers/_validators.py +229 -0
  13. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/static.py +2 -4
  14. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/navigation.py +1 -1
  15. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/parser.py +16 -12
  16. {scrapling-0.3.3 → scrapling-0.3.5/scrapling.egg-info}/PKG-INFO +18 -17
  17. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/requires.txt +4 -4
  18. {scrapling-0.3.3 → scrapling-0.3.5}/setup.cfg +1 -1
  19. scrapling-0.3.3/scrapling/engines/_browsers/_validators.py +0 -164
  20. {scrapling-0.3.3 → scrapling-0.3.5}/LICENSE +0 -0
  21. {scrapling-0.3.3 → scrapling-0.3.5}/MANIFEST.in +0 -0
  22. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/__init__.py +0 -0
  23. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/_html_utils.py +0 -0
  24. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/_types.py +0 -0
  25. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/ai.py +0 -0
  26. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/mixins.py +0 -0
  27. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/storage.py +0 -0
  28. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/translator.py +0 -0
  29. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/__init__.py +0 -0
  30. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/_shell.py +0 -0
  31. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/utils/_utils.py +0 -0
  32. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/__init__.py +0 -0
  33. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/__init__.py +0 -0
  34. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_config_tools.py +0 -0
  35. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/constants.py +0 -0
  36. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/__init__.py +0 -0
  37. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/navigator_plugins.js +0 -0
  38. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/notification_permission.js +0 -0
  39. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +0 -0
  40. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/screen_props.js +0 -0
  41. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/webdriver_fully.js +0 -0
  42. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/bypasses/window_chrome.js +0 -0
  43. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/convertor.py +0 -0
  44. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/custom.py +0 -0
  45. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/toolbelt/fingerprints.py +0 -0
  46. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/fetchers.py +0 -0
  47. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling/py.typed +0 -0
  48. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/SOURCES.txt +0 -0
  49. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/dependency_links.txt +0 -0
  50. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/entry_points.txt +0 -0
  51. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/not-zip-safe +0 -0
  52. {scrapling-0.3.3 → scrapling-0.3.5}/scrapling.egg-info/top_level.txt +0 -0
{scrapling-0.3.3/scrapling.egg-info → scrapling-0.3.5}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: scrapling
-Version: 0.3.3
+Version: 0.3.5
 Summary: Scrapling is an undetectable, powerful, flexible, high-performance Python library that makes Web Scraping easy and effortless as it should be!
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -69,15 +69,15 @@ Requires-Dist: cssselect>=1.3.0
 Requires-Dist: orjson>=3.11.3
 Requires-Dist: tldextract>=5.3.0
 Provides-Extra: fetchers
-Requires-Dist: click>=8.2.1; extra == "fetchers"
+Requires-Dist: click>=8.3.0; extra == "fetchers"
 Requires-Dist: curl_cffi>=0.13.0; extra == "fetchers"
-Requires-Dist: playwright>=1.52.0; extra == "fetchers"
-Requires-Dist: rebrowser-playwright>=1.52.0; extra == "fetchers"
+Requires-Dist: playwright>=1.55.0; extra == "fetchers"
+Requires-Dist: patchright>=1.55.2; extra == "fetchers"
 Requires-Dist: camoufox>=0.4.11; extra == "fetchers"
 Requires-Dist: geoip2>=5.1.0; extra == "fetchers"
 Requires-Dist: msgspec>=0.19.0; extra == "fetchers"
 Provides-Extra: ai
-Requires-Dist: mcp>=1.14.0; extra == "ai"
+Requires-Dist: mcp>=1.14.1; extra == "ai"
 Requires-Dist: markdownify>=1.2.0; extra == "ai"
 Requires-Dist: scrapling[fetchers]; extra == "ai"
 Provides-Extra: shell
@@ -114,14 +114,6 @@ Dynamic: license-file
 </p>
 
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>
@@ -130,6 +122,14 @@ Dynamic: license-file
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>
@@ -157,11 +157,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 
@@ -410,10 +412,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
{scrapling-0.3.3 → scrapling-0.3.5}/README.md

@@ -24,14 +24,6 @@
 </p>
 
 <p align="center">
-    <a href="https://scrapling.readthedocs.io/en/latest/#installation">
-        Installation
-    </a>
-    ·
-    <a href="https://scrapling.readthedocs.io/en/latest/overview/">
-        Overview
-    </a>
-    ·
     <a href="https://scrapling.readthedocs.io/en/latest/parsing/selection/">
         Selection methods
     </a>
@@ -40,6 +32,14 @@
         Choosing a fetcher
     </a>
     ·
+    <a href="https://scrapling.readthedocs.io/en/latest/cli/overview/">
+        CLI
+    </a>
+    ·
+    <a href="https://scrapling.readthedocs.io/en/latest/ai/mcp-server/">
+        MCP mode
+    </a>
+    ·
     <a href="https://scrapling.readthedocs.io/en/latest/tutorials/migrating_from_beautifulsoup/">
         Migrating from Beautifulsoup
     </a>
@@ -67,11 +67,13 @@ Built for the modern Web, Scrapling has its own rapid parsing engine and its fet
 
 <!-- sponsors -->
 
+<a href="https://www.thordata.com/?ls=github&lk=D4Vinci" target="_blank" title="A global network of over 60M+ residential proxies with 99.7% availability, ensuring stable and reliable web data scraping to support AI, BI, and workflows."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/thordata.jpg"></a>
 <a href="https://evomi.com?utm_source=github&utm_medium=banner&utm_campaign=d4vinci-scrapling" target="_blank" title="Evomi is your Swiss Quality Proxy Provider, starting at $0.49/GB"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/evomi.png"></a>
+<a href="https://visit.decodo.com/Dy6W0b" target="_blank" title="Try the Most Efficient Residential Proxies for Free"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/decodo.png"></a>
 <a href="https://petrosky.io/d4vinci" target="_blank" title="PetroSky delivers cutting-edge VPS hosting."><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/petrosky.png"></a>
 <a href="https://www.swiftproxy.net/" target="_blank" title="Unlock Reliable Proxy Services with Swiftproxy!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/swiftproxy.png"></a>
-<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 <a href="https://www.nstproxy.com/?type=flow&utm_source=scrapling" target="_blank" title="One Proxy Service, Infinite Solutions at Unbeatable Prices!"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/NSTproxy.png"></a>
+<a href="https://serpapi.com/?utm_source=scrapling" target="_blank" title="Scrape Google and other search engines with SerpApi"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png"></a>
 
 <!-- /sponsors -->
 
@@ -320,10 +322,9 @@ This project includes code adapted from:
 ## Thanks and References
 
 - [Daijro](https://github.com/daijro)'s brilliant work on [BrowserForge](https://github.com/daijro/browserforge) and [Camoufox](https://github.com/daijro/camoufox)
-- [Vinyzu](https://github.com/Vinyzu)'s work on [Botright](https://github.com/Vinyzu/Botright)
+- [Vinyzu](https://github.com/Vinyzu)'s brilliant work on [Botright](https://github.com/Vinyzu/Botright) and [PatchRight](https://github.com/Kaliiiiiiiiii-Vinyzu/patchright)
 - [brotector](https://github.com/kaliiiiiiiiii/brotector) for browser detection bypass techniques
-- [fakebrowser](https://github.com/kkoooqq/fakebrowser) for fingerprinting research
-- [rebrowser-patches](https://github.com/rebrowser/rebrowser-patches) for stealth improvements
+- [fakebrowser](https://github.com/kkoooqq/fakebrowser) and [BotBrowser](https://github.com/botswin/BotBrowser) for fingerprinting research
 
 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
{scrapling-0.3.3 → scrapling-0.3.5}/pyproject.toml

@@ -64,16 +64,16 @@ dependencies = [
 
 [project.optional-dependencies]
 fetchers = [
-    "click>=8.2.1",
+    "click>=8.3.0",
     "curl_cffi>=0.13.0",
-    "playwright>=1.52.0",
-    "rebrowser-playwright>=1.52.0",
+    "playwright>=1.55.0",
+    "patchright>=1.55.2",
     "camoufox>=0.4.11",
     "geoip2>=5.1.0",
     "msgspec>=0.19.0",
 ]
 ai = [
-    "mcp>=1.14.0",
+    "mcp>=1.14.1",
     "markdownify>=1.2.0",
     "scrapling[fetchers]",
 ]
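
Note: the extras above drive what `pip` pulls in, including the new patchright dependency that replaces rebrowser-playwright. A minimal install sketch, assuming a standard pip setup (the extra names are taken verbatim from pyproject.toml):

    pip install scrapling                 # core parser only
    pip install "scrapling[fetchers]"     # adds playwright, patchright, camoufox, curl_cffi, ...
    pip install "scrapling[ai]"           # MCP/markdown extras; also pulls in [fetchers]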
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/__init__.py

@@ -1,5 +1,5 @@
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.3.3"
+__version__ = "0.3.5"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/cli.py

@@ -32,8 +32,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any
 
     try:
         return json_loads(json_string)
-    except JSONDecodeError as e:  # pragma: no cover
-        raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+    except JSONDecodeError as err:  # pragma: no cover
+        raise ValueError(f"Invalid JSON data '{json_string}': {err}")
 
 
 def __Request_and_Save(
@@ -65,8 +65,8 @@ def __ParseExtractArguments(
     for key, value in _CookieParser(cookies):
        try:
            parsed_cookies[key] = value
-        except Exception as e:
-            raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+        except Exception as err:
+            raise ValueError(f"Could not parse cookies '{cookies}': {err}")
 
     parsed_json = __ParseJSONData(json)
     parsed_params = {}
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/custom_types.py

@@ -145,7 +145,7 @@ class TextHandler(str):
         clean_match: bool = False,
         case_sensitive: bool = True,
         check_match: Literal[False] = False,
-    ) -> "TextHandlers[TextHandler]": ...
+    ) -> "TextHandlers": ...
 
     def re(
         self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
         replace_entities: bool = True,
         clean_match: bool = False,
         case_sensitive: bool = True,
-    ) -> "TextHandlers[TextHandler]":
+    ) -> "TextHandlers":
         """Call the ``.re()`` method for each element in this list and return
         their results flattened as TextHandlers.
 
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/core/shell.py

@@ -201,7 +201,7 @@ class CurlParser:
            data_payload = parsed_args.data_binary  # Fallback to string
 
        elif parsed_args.data_raw is not None:
-            data_payload = parsed_args.data_raw
+            data_payload = parsed_args.data_raw.lstrip("$")
 
        elif parsed_args.data is not None:
            data_payload = parsed_args.data
@@ -318,7 +318,7 @@ def show_page_in_browser(page: Selector): # pragma: no cover
    try:
        fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
        with open(fd, "w", encoding=page.encoding) as f:
-            f.write(page.body)
+            f.write(page.html_content)
 
        open_in_browser(f"file://{fname}")
    except IOError as e:
@@ -335,15 +335,25 @@ class CustomShell:
         from scrapling.fetchers import (
             Fetcher as __Fetcher,
             AsyncFetcher as __AsyncFetcher,
+            FetcherSession as __FetcherSession,
             DynamicFetcher as __DynamicFetcher,
+            DynamicSession as __DynamicSession,
+            AsyncDynamicSession as __AsyncDynamicSession,
             StealthyFetcher as __StealthyFetcher,
+            StealthySession as __StealthySession,
+            AsyncStealthySession as __AsyncStealthySession,
         )
 
         self.__InteractiveShellEmbed = __InteractiveShellEmbed
         self.__Fetcher = __Fetcher
         self.__AsyncFetcher = __AsyncFetcher
+        self.__FetcherSession = __FetcherSession
         self.__DynamicFetcher = __DynamicFetcher
+        self.__DynamicSession = __DynamicSession
+        self.__AsyncDynamicSession = __AsyncDynamicSession
         self.__StealthyFetcher = __StealthyFetcher
+        self.__StealthySession = __StealthySession
+        self.__AsyncStealthySession = __AsyncStealthySession
         self.code = code
         self.page = None
         self.pages = Selectors([])
@@ -379,9 +389,9 @@ class CustomShell:
         """Create a custom banner for the shell"""
         return f"""
-> Available Scrapling objects:
-   - Fetcher/AsyncFetcher
-   - DynamicFetcher
-   - StealthyFetcher
+   - Fetcher/AsyncFetcher/FetcherSession
+   - DynamicFetcher/DynamicSession/AsyncDynamicSession
+   - StealthyFetcher/StealthySession/AsyncStealthySession
    - Selector
 
-> Useful shortcuts:
@@ -449,6 +459,11 @@ Type 'exit' or press Ctrl+D to exit.
             "delete": delete,
             "Fetcher": self.__Fetcher,
             "AsyncFetcher": self.__AsyncFetcher,
+            "FetcherSession": self.__FetcherSession,
+            "DynamicSession": self.__DynamicSession,
+            "AsyncDynamicSession": self.__AsyncDynamicSession,
+            "StealthySession": self.__StealthySession,
+            "AsyncStealthySession": self.__AsyncStealthySession,
             "fetch": dynamic_fetch,
             "DynamicFetcher": self.__DynamicFetcher,
             "stealthy_fetch": stealthy_fetch,
@@ -530,7 +545,7 @@ class Convertor:
         for page in pages:
             match extraction_type:
                 case "markdown":
-                    yield cls._convert_to_markdown(page.body)
+                    yield cls._convert_to_markdown(page.html_content)
                 case "html":
                     yield page.body
                 case "text":
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_base.py

@@ -1,4 +1,4 @@
-from time import time, sleep
+from time import time
 from asyncio import sleep as asyncio_sleep, Lock
 
 from camoufox import DefaultAddons
@@ -31,7 +31,7 @@ class SyncSession:
     def __init__(self, max_pages: int = 1):
         self.max_pages = max_pages
         self.page_pool = PagePool(max_pages)
-        self.__max_wait_for_page = 60
+        self._max_wait_for_page = 60
         self.playwright: Optional[Playwright] = None
         self.context: Optional[BrowserContext] = None
         self._closed = False
@@ -44,23 +44,7 @@
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
 
-        # Close all finished pages to ensure clean state
-        self.page_pool.close_all_finished_pages()
-
-        # If we're at max capacity after cleanup, wait for busy pages to finish
-        if self.page_pool.pages_count >= self.max_pages:
-            start_time = time()
-            while time() - start_time < self.__max_wait_for_page:
-                # Wait for any pages to finish, then clean them up
-                sleep(0.05)
-                self.page_pool.close_all_finished_pages()
-                if self.page_pool.pages_count < self.max_pages:
-                    break
-            else:
-                raise TimeoutError(
-                    f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
-                )
-
+        # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
         page = self.context.new_page()
         page.set_default_navigation_timeout(timeout)
         page.set_default_timeout(timeout)
@@ -76,11 +60,6 @@
 
         return self.page_pool.add_page(page)
 
-    @staticmethod
-    def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-        """Get value with request-level priority over session-level"""
-        return request_value if request_value is not sentinel_value else session_value
-
     def get_pool_stats(self) -> Dict[str, int]:
         """Get statistics about the current page pool"""
         return {
@@ -105,21 +84,16 @@ class AsyncSession(SyncSession):
     ) -> PageInfo:  # pragma: no cover
         """Get a new page to use"""
         async with self._lock:
-            # Close all finished pages to ensure clean state
-            await self.page_pool.aclose_all_finished_pages()
-
             # If we're at max capacity after cleanup, wait for busy pages to finish
             if self.page_pool.pages_count >= self.max_pages:
                 start_time = time()
-                while time() - start_time < self.__max_wait_for_page:
-                    # Wait for any pages to finish, then clean them up
+                while time() - start_time < self._max_wait_for_page:
                     await asyncio_sleep(0.05)
-                    await self.page_pool.aclose_all_finished_pages()
                     if self.page_pool.pages_count < self.max_pages:
                         break
                 else:
                     raise TimeoutError(
-                        f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
+                        f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
                     )
 
             page = await self.context.new_page()
{scrapling-0.3.3 → scrapling-0.3.5}/scrapling/engines/_browsers/_camoufox.py

@@ -14,8 +14,9 @@ from playwright.async_api import (
     Locator as AsyncLocator,
     Page as async_Page,
 )
+from playwright._impl._errors import Error as PlaywrightError
 
-from ._validators import validate, CamoufoxConfig
+from ._validators import validate_fetch as _validate
 from ._base import SyncSession, AsyncSession, StealthySessionMixin
 from scrapling.core.utils import log
 from scrapling.core._types import (
@@ -201,20 +202,34 @@
 
         self._closed = True
 
+    @staticmethod
+    def _get_page_content(page: Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return page.content() or ""
+            except PlaywrightError:
+                page.wait_for_timeout(1000)
+                continue
+
     def _solve_cloudflare(self, page: Page) -> None:  # pragma: no cover
         """Solve the cloudflare challenge displayed on the playwright page passed
 
         :param page: The targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(page.content())
+        challenge_type = self._detect_cloudflare(self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":
-                while "<title>Just a moment...</title>" in (page.content()):
+                while "<title>Just a moment...</title>" in (self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     page.wait_for_timeout(1000)
                     page.wait_for_load_state()
@@ -222,7 +237,7 @@ class StealthySession(StealthySessionMixin, SyncSession):
                 return
 
         else:
-            while "Verifying you are human." in page.content():
+            while "Verifying you are human." in self._get_page_content(page):
                 # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                 page.wait_for_timeout(500)
 
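Note: the `_get_page_content` helper added above retries `page.content()` until it stops raising, because Playwright can throw when the call races a navigation (the Windows issue referenced in the docstring). A standalone sketch of the same retry pattern for plain sync Playwright (illustrative only; not part of the package):

    from playwright.sync_api import sync_playwright, Error as PlaywrightError

    def content_with_retry(page) -> str:
        # Keep retrying until the DOM snapshot can be serialized;
        # Playwright raises while a navigation invalidates the page.
        while True:
            try:
                return page.content() or ""
            except PlaywrightError:
                page.wait_for_timeout(1000)  # back off 1s, then retry

    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        page.goto("https://example.com")  # placeholder URL
        print(len(content_with_retry(page)))
        browser.close()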
@@ -282,23 +297,22 @@
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        # Validate all resolved parameters
-        params = validate(
-            dict(
-                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
-                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
-                wait=self._get_with_precedence(wait, self.wait, _UNSET),
-                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
-                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
-                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
-                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
-                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
-                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
-                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
-                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
-                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
-            ),
-            CamoufoxConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
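
Note: the `_validate` refactor above replaces the per-argument `_get_with_precedence` helper (deleted from `_base.py` earlier in this diff) with one call taking `(name, request_value, session_value)` triples plus the `_UNSET` sentinel: a per-request value wins unless the caller left it at the sentinel, in which case the session default applies. A minimal sketch of that resolution rule (hypothetical helper; the real `validate_fetch` in `_validators.py` also validates types and builds the config object):

    _UNSET = object()  # sentinel: "caller did not pass this argument"

    def resolve(triples, sentinel):
        # Request-level values take precedence over session-level defaults
        return {
            name: session_value if request_value is sentinel else request_value
            for name, request_value, session_value in triples
        }

    resolve([("timeout", _UNSET, 30000), ("wait", 200, 0)], _UNSET)
    # -> {"timeout": 30000, "wait": 200}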
@@ -366,8 +380,9 @@
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as finished for next use
-            page_info.mark_finished()
+            # Close the page, to free up resources
+            page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 
@@ -506,20 +521,34 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
 
         self._closed = True
 
+    @staticmethod
+    async def _get_page_content(page: async_Page) -> str | None:
+        """
+        A workaround for Playwright issue with `page.content()` on Windows. Ref.: https://github.com/microsoft/playwright/issues/16108
+        :param page: The page to extract content from.
+        :return:
+        """
+        while True:
+            try:
+                return (await page.content()) or ""
+            except PlaywrightError:
+                await page.wait_for_timeout(1000)
+                continue
+
     async def _solve_cloudflare(self, page: async_Page):
         """Solve the cloudflare challenge displayed on the playwright page passed. The async version
 
         :param page: The async targeted page
         :return:
         """
-        challenge_type = self._detect_cloudflare(await page.content())
+        challenge_type = self._detect_cloudflare(await self._get_page_content(page))
         if not challenge_type:
             log.error("No Cloudflare challenge found.")
             return
         else:
             log.info(f'The turnstile version discovered is "{challenge_type}"')
             if challenge_type == "non-interactive":  # pragma: no cover
-                while "<title>Just a moment...</title>" in (await page.content()):
+                while "<title>Just a moment...</title>" in (await self._get_page_content(page)):
                     log.info("Waiting for Cloudflare wait page to disappear.")
                     await page.wait_for_timeout(1000)
                     await page.wait_for_load_state()
@@ -527,7 +556,7 @@ class AsyncStealthySession(StealthySessionMixin, AsyncSession):
                 return
 
         else:
-            while "Verifying you are human." in (await page.content()):
+            while "Verifying you are human." in (await self._get_page_content(page)):
                 # Waiting for the verify spinner to disappear, checking every 1s if it disappeared
                 await page.wait_for_timeout(500)
 
@@ -587,22 +616,22 @@
         :param selector_config: The arguments that will be passed in the end while creating the final Selector's class.
         :return: A `Response` object.
         """
-        params = validate(
-            dict(
-                google_search=self._get_with_precedence(google_search, self.google_search, _UNSET),
-                timeout=self._get_with_precedence(timeout, self.timeout, _UNSET),
-                wait=self._get_with_precedence(wait, self.wait, _UNSET),
-                page_action=self._get_with_precedence(page_action, self.page_action, _UNSET),
-                extra_headers=self._get_with_precedence(extra_headers, self.extra_headers, _UNSET),
-                disable_resources=self._get_with_precedence(disable_resources, self.disable_resources, _UNSET),
-                wait_selector=self._get_with_precedence(wait_selector, self.wait_selector, _UNSET),
-                wait_selector_state=self._get_with_precedence(wait_selector_state, self.wait_selector_state, _UNSET),
-                network_idle=self._get_with_precedence(network_idle, self.network_idle, _UNSET),
-                load_dom=self._get_with_precedence(load_dom, self.load_dom, _UNSET),
-                solve_cloudflare=self._get_with_precedence(solve_cloudflare, self.solve_cloudflare, _UNSET),
-                selector_config=self._get_with_precedence(selector_config, self.selector_config, _UNSET),
-            ),
-            CamoufoxConfig,
+        params = _validate(
+            [
+                ("google_search", google_search, self.google_search),
+                ("timeout", timeout, self.timeout),
+                ("wait", wait, self.wait),
+                ("page_action", page_action, self.page_action),
+                ("extra_headers", extra_headers, self.extra_headers),
+                ("disable_resources", disable_resources, self.disable_resources),
+                ("wait_selector", wait_selector, self.wait_selector),
+                ("wait_selector_state", wait_selector_state, self.wait_selector_state),
+                ("network_idle", network_idle, self.network_idle),
+                ("load_dom", load_dom, self.load_dom),
+                ("solve_cloudflare", solve_cloudflare, self.solve_cloudflare),
+                ("selector_config", selector_config, self.selector_config),
+            ],
+            _UNSET,
         )
 
         if self._closed:  # pragma: no cover
@@ -672,8 +701,9 @@
                 page_info.page, first_response, final_response, params.selector_config
             )
 
-            # Mark the page as finished for next use
-            page_info.mark_finished()
+            # Close the page, to free up resources
+            await page_info.page.close()
+            self.page_pool.pages.remove(page_info)
 
             return response
 