scrapling 0.3.1-py3-none-any.whl → 0.3.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +1 -1
  2. scrapling/cli.py +38 -51
  3. scrapling/core/_html_utils.py +3 -9
  4. scrapling/core/ai.py +5 -13
  5. scrapling/core/custom_types.py +19 -61
  6. scrapling/core/mixins.py +6 -28
  7. scrapling/core/shell.py +49 -127
  8. scrapling/core/storage.py +2 -8
  9. scrapling/core/translator.py +8 -20
  10. scrapling/core/utils/__init__.py +10 -0
  11. scrapling/core/utils/_shell.py +48 -0
  12. scrapling/core/{utils.py → utils/_utils.py} +5 -21
  13. scrapling/engines/__init__.py +0 -16
  14. scrapling/engines/_browsers/_base.py +297 -0
  15. scrapling/engines/_browsers/_camoufox.py +227 -296
  16. scrapling/engines/_browsers/_config_tools.py +2 -1
  17. scrapling/engines/_browsers/_controllers.py +209 -281
  18. scrapling/engines/_browsers/_page.py +37 -15
  19. scrapling/engines/_browsers/_validators.py +9 -15
  20. scrapling/engines/constants.py +3 -6
  21. scrapling/engines/static.py +25 -75
  22. scrapling/engines/toolbelt/__init__.py +1 -20
  23. scrapling/engines/toolbelt/convertor.py +95 -86
  24. scrapling/engines/toolbelt/custom.py +7 -99
  25. scrapling/engines/toolbelt/fingerprints.py +1 -3
  26. scrapling/engines/toolbelt/navigation.py +4 -58
  27. scrapling/fetchers.py +29 -24
  28. scrapling/parser.py +45 -122
  29. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/METADATA +54 -46
  30. scrapling-0.3.2.dist-info/RECORD +44 -0
  31. scrapling-0.3.1.dist-info/RECORD +0 -41
  32. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/WHEEL +0 -0
  33. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/entry_points.txt +0 -0
  34. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/licenses/LICENSE +0 -0
  35. {scrapling-0.3.1.dist-info → scrapling-0.3.2.dist-info}/top_level.txt +0 -0
scrapling/engines/_browsers/_base.py (new file)
@@ -0,0 +1,297 @@
+from time import time, sleep
+from asyncio import sleep as asyncio_sleep, Lock
+
+from camoufox import DefaultAddons
+from playwright.sync_api import BrowserContext, Playwright
+from playwright.async_api import (
+    BrowserContext as AsyncBrowserContext,
+    Playwright as AsyncPlaywright,
+)
+from camoufox.utils import (
+    launch_options as generate_launch_options,
+    installed_verstr as camoufox_version,
+)
+
+from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
+from scrapling.core._types import (
+    Any,
+    Dict,
+    Optional,
+)
+from ._page import PageInfo, PagePool
+from ._config_tools import _compiled_stealth_scripts
+from ._config_tools import _launch_kwargs, _context_kwargs
+from scrapling.engines.toolbelt.fingerprints import get_os_name
+from ._validators import validate, PlaywrightConfig, CamoufoxConfig
+
+__ff_version_str__ = camoufox_version().split(".", 1)[0]
+
+
+class SyncSession:
+    def __init__(self, max_pages: int = 1):
+        self.max_pages = max_pages
+        self.page_pool = PagePool(max_pages)
+        self.__max_wait_for_page = 60
+        self.playwright: Optional[Playwright] = None
+        self.context: Optional[BrowserContext] = None
+        self._closed = False
+
+    def _get_page(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+    ) -> PageInfo:  # pragma: no cover
+        """Get a new page to use"""
+
+        # Close all finished pages to ensure clean state
+        self.page_pool.close_all_finished_pages()
+
+        # If we're at max capacity after cleanup, wait for busy pages to finish
+        if self.page_pool.pages_count >= self.max_pages:
+            start_time = time()
+            while time() - start_time < self.__max_wait_for_page:
+                # Wait for any pages to finish, then clean them up
+                sleep(0.05)
+                self.page_pool.close_all_finished_pages()
+                if self.page_pool.pages_count < self.max_pages:
+                    break
+            else:
+                raise TimeoutError(
+                    f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
+                )
+
+        page = self.context.new_page()
+        page.set_default_navigation_timeout(timeout)
+        page.set_default_timeout(timeout)
+        if extra_headers:
+            page.set_extra_http_headers(extra_headers)
+
+        if disable_resources:
+            page.route("**/*", intercept_route)
+
+        if getattr(self, "stealth", False):
+            for script in _compiled_stealth_scripts():
+                page.add_init_script(script=script)
+
+        return self.page_pool.add_page(page)
+
+    @staticmethod
+    def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
+        """Get value with request-level priority over session-level"""
+        return request_value if request_value is not sentinel_value else session_value
+
+    def get_pool_stats(self) -> Dict[str, int]:
+        """Get statistics about the current page pool"""
+        return {
+            "total_pages": self.page_pool.pages_count,
+            "busy_pages": self.page_pool.busy_count,
+            "max_pages": self.max_pages,
+        }
+
+
+class AsyncSession(SyncSession):
+    def __init__(self, max_pages: int = 1):
+        super().__init__(max_pages)
+        self.playwright: Optional[AsyncPlaywright] = None
+        self.context: Optional[AsyncBrowserContext] = None
+        self._lock = Lock()
+
+    async def _get_page(
+        self,
+        timeout: int | float,
+        extra_headers: Optional[Dict[str, str]],
+        disable_resources: bool,
+    ) -> PageInfo:  # pragma: no cover
+        """Get a new page to use"""
+        async with self._lock:
+            # Close all finished pages to ensure clean state
+            await self.page_pool.aclose_all_finished_pages()
+
+            # If we're at max capacity after cleanup, wait for busy pages to finish
+            if self.page_pool.pages_count >= self.max_pages:
+                start_time = time()
+                while time() - start_time < self.__max_wait_for_page:
+                    # Wait for any pages to finish, then clean them up
+                    await asyncio_sleep(0.05)
+                    await self.page_pool.aclose_all_finished_pages()
+                    if self.page_pool.pages_count < self.max_pages:
+                        break
+                else:
+                    raise TimeoutError(
+                        f"No pages finished to clear place in the pool within the {self.__max_wait_for_page}s timeout period"
+                    )
+
+            page = await self.context.new_page()
+            page.set_default_navigation_timeout(timeout)
+            page.set_default_timeout(timeout)
+            if extra_headers:
+                await page.set_extra_http_headers(extra_headers)
+
+            if disable_resources:
+                await page.route("**/*", async_intercept_route)
+
+            if getattr(self, "stealth", False):
+                for script in _compiled_stealth_scripts():
+                    await page.add_init_script(script=script)
+
+            return self.page_pool.add_page(page)
+
+
+class DynamicSessionMixin:
+    def __validate__(self, **params):
+        config = validate(params, model=PlaywrightConfig)
+
+        self.max_pages = config.max_pages
+        self.headless = config.headless
+        self.hide_canvas = config.hide_canvas
+        self.disable_webgl = config.disable_webgl
+        self.real_chrome = config.real_chrome
+        self.stealth = config.stealth
+        self.google_search = config.google_search
+        self.wait = config.wait
+        self.proxy = config.proxy
+        self.locale = config.locale
+        self.extra_headers = config.extra_headers
+        self.useragent = config.useragent
+        self.timeout = config.timeout
+        self.cookies = config.cookies
+        self.disable_resources = config.disable_resources
+        self.cdp_url = config.cdp_url
+        self.network_idle = config.network_idle
+        self.load_dom = config.load_dom
+        self.wait_selector = config.wait_selector
+        self.init_script = config.init_script
+        self.wait_selector_state = config.wait_selector_state
+        self.selector_config = config.selector_config
+        self.page_action = config.page_action
+        self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
+        self.__initiate_browser_options__()
+
+    def __initiate_browser_options__(self):
+        if not self.cdp_url:
+            # `launch_options` is used with persistent context
+            self.launch_options = dict(
+                _launch_kwargs(
+                    self.headless,
+                    self.proxy,
+                    self.locale,
+                    tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
+                    self.useragent,
+                    self.real_chrome,
+                    self.stealth,
+                    self.hide_canvas,
+                    self.disable_webgl,
+                )
+            )
+            self.launch_options["extra_http_headers"] = dict(self.launch_options["extra_http_headers"])
+            self.launch_options["proxy"] = dict(self.launch_options["proxy"]) or None
+            self.context_options = dict()
+        else:
+            # while `context_options` is left to be used when cdp mode is enabled
+            self.launch_options = dict()
+            self.context_options = dict(
+                _context_kwargs(
+                    self.proxy,
+                    self.locale,
+                    tuple(self.extra_headers.items()) if self.extra_headers else tuple(),
+                    self.useragent,
+                    self.stealth,
+                )
+            )
+            self.context_options["extra_http_headers"] = dict(self.context_options["extra_http_headers"])
+            self.context_options["proxy"] = dict(self.context_options["proxy"]) or None
+
+
+class StealthySessionMixin:
+    def __validate__(self, **params):
+        config = validate(params, model=CamoufoxConfig)
+
+        self.max_pages = config.max_pages
+        self.headless = config.headless
+        self.block_images = config.block_images
+        self.disable_resources = config.disable_resources
+        self.block_webrtc = config.block_webrtc
+        self.allow_webgl = config.allow_webgl
+        self.network_idle = config.network_idle
+        self.load_dom = config.load_dom
+        self.humanize = config.humanize
+        self.solve_cloudflare = config.solve_cloudflare
+        self.wait = config.wait
+        self.timeout = config.timeout
+        self.page_action = config.page_action
+        self.wait_selector = config.wait_selector
+        self.init_script = config.init_script
+        self.addons = config.addons
+        self.wait_selector_state = config.wait_selector_state
+        self.cookies = config.cookies
+        self.google_search = config.google_search
+        self.extra_headers = config.extra_headers
+        self.proxy = config.proxy
+        self.os_randomize = config.os_randomize
+        self.disable_ads = config.disable_ads
+        self.geoip = config.geoip
+        self.selector_config = config.selector_config
+        self.additional_args = config.additional_args
+        self.page_action = config.page_action
+        self._headers_keys = set(map(str.lower, self.extra_headers.keys())) if self.extra_headers else set()
+        self.__initiate_browser_options__()
+
+    def __initiate_browser_options__(self):
+        """Initiate browser options."""
+        self.launch_options = generate_launch_options(
+            **{
+                "geoip": self.geoip,
+                "proxy": dict(self.proxy) if self.proxy else self.proxy,
+                "addons": self.addons,
+                "exclude_addons": [] if self.disable_ads else [DefaultAddons.UBO],
+                "headless": self.headless,
+                "humanize": True if self.solve_cloudflare else self.humanize,
+                "i_know_what_im_doing": True,  # To turn warnings off with the user configurations
+                "allow_webgl": self.allow_webgl,
+                "block_webrtc": self.block_webrtc,
+                "block_images": self.block_images,  # Careful! it makes some websites don't finish loading at all like stackoverflow even in headful mode.
+                "os": None if self.os_randomize else get_os_name(),
+                "user_data_dir": "",
+                "ff_version": __ff_version_str__,
+                "firefox_user_prefs": {
+                    # This is what enabling `enable_cache` does internally, so we do it from here instead
+                    "browser.sessionhistory.max_entries": 10,
+                    "browser.sessionhistory.max_total_viewers": -1,
+                    "browser.cache.memory.enable": True,
+                    "browser.cache.disk_cache_ssl": True,
+                    "browser.cache.disk.smart_size.enabled": True,
+                },
+                **self.additional_args,
+            }
+        )
+
+    @staticmethod
+    def _detect_cloudflare(page_content: str) -> str | None:
+        """
+        Detect the type of Cloudflare challenge present in the provided page content.
+
+        This function analyzes the given page content to identify whether a specific
+        type of Cloudflare challenge is present. It checks for three predefined
+        challenge types: non-interactive, managed, and interactive. If a challenge
+        type is detected, it returns the corresponding type as a string. If no
+        challenge type is detected, it returns None.
+
+        Args:
+            page_content (str): The content of the page to analyze for Cloudflare
+                challenge types.
+
+        Returns:
+            str: A string representing the detected Cloudflare challenge type, if
+                found. Returns None if no challenge matches.
+        """
+        challenge_types = (
+            "non-interactive",
+            "managed",
+            "interactive",
+        )
+        for ctype in challenge_types:
+            if f"cType: '{ctype}'" in page_content:
+                return ctype
+
+        return None
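
For orientation, a minimal sketch (not part of the wheel) of how two of the helpers added in _base.py behave. It assumes scrapling 0.3.2 and a downloaded Camoufox build are installed, since the module calls installed_verstr() at import time; the HTML snippet is invented for the example.

# Illustrative sketch only -- not shipped in the wheel.
from scrapling.engines._browsers._base import SyncSession, StealthySessionMixin

# _detect_cloudflare() is a plain substring scan for the "cType: '...'" marker
# Cloudflare embeds in its challenge pages; the surrounding script text here
# is made up.
html = "<script>chlOpt = { cType: 'managed' };</script>"
assert StealthySessionMixin._detect_cloudflare(html) == "managed"
assert StealthySessionMixin._detect_cloudflare("<html></html>") is None

# get_pool_stats() reports page-pool usage; a fresh session that has not
# opened a browser context yet should report an empty pool.
session = SyncSession(max_pages=2)
print(session.get_pool_stats())  # e.g. {'total_pages': 0, 'busy_pages': 0, 'max_pages': 2}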