scrapling-0.3.4-py3-none-any.whl → scrapling-0.3.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scrapling/__init__.py CHANGED
@@ -1,28 +1,38 @@
  __author__ = "Karim Shoair (karim.shoair@pm.me)"
- __version__ = "0.3.4"
+ __version__ = "0.3.6"
  __copyright__ = "Copyright (c) 2024 Karim Shoair"

+ from typing import Any, TYPE_CHECKING

- # A lightweight approach to create a lazy loader for each import for backward compatibility
- # This will reduces initial memory footprint significantly (only loads what's used)
- def __getattr__(name):
-     lazy_imports = {
-         "Fetcher": ("scrapling.fetchers", "Fetcher"),
-         "Selector": ("scrapling.parser", "Selector"),
-         "Selectors": ("scrapling.parser", "Selectors"),
-         "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
-         "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
-         "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
-         "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
-         "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
-     }
-
-     if name in lazy_imports:
-         module_path, class_name = lazy_imports[name]
+ if TYPE_CHECKING:
+     from scrapling.parser import Selector, Selectors
+     from scrapling.core.custom_types import AttributesHandler, TextHandler
+     from scrapling.fetchers import Fetcher, AsyncFetcher, StealthyFetcher, DynamicFetcher
+
+
+ # Lazy import mapping
+ _LAZY_IMPORTS = {
+     "Fetcher": ("scrapling.fetchers", "Fetcher"),
+     "Selector": ("scrapling.parser", "Selector"),
+     "Selectors": ("scrapling.parser", "Selectors"),
+     "AttributesHandler": ("scrapling.core.custom_types", "AttributesHandler"),
+     "TextHandler": ("scrapling.core.custom_types", "TextHandler"),
+     "AsyncFetcher": ("scrapling.fetchers", "AsyncFetcher"),
+     "StealthyFetcher": ("scrapling.fetchers", "StealthyFetcher"),
+     "DynamicFetcher": ("scrapling.fetchers", "DynamicFetcher"),
+ }
+ __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+
+
+ def __getattr__(name: str) -> Any:
+     if name in _LAZY_IMPORTS:
+         module_path, class_name = _LAZY_IMPORTS[name]
          module = __import__(module_path, fromlist=[class_name])
          return getattr(module, class_name)
      else:
-         raise AttributeError(f"module 'scrapling' has no attribute '{name}'")
+         raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


- __all__ = ["Selector", "Fetcher", "AsyncFetcher", "StealthyFetcher", "DynamicFetcher"]
+ def __dir__() -> list[str]:
+     """Support for dir() and autocomplete."""
+     return sorted(__all__ + ["fetchers", "parser", "cli", "core", "__author__", "__version__", "__copyright__"])
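For readers unfamiliar with the pattern: a module-level __getattr__ (PEP 562) defers the real imports until first attribute access, while the TYPE_CHECKING block keeps IDEs and type checkers aware of the names. A minimal sketch of the observable behavior, assuming a standard install:

    import sys
    import scrapling

    # Importing the package does not load the heavy submodules yet.
    print("scrapling.fetchers" in sys.modules)  # False

    # First access routes through scrapling.__getattr__, which imports
    # scrapling.fetchers via _LAZY_IMPORTS and returns the class.
    Fetcher = scrapling.Fetcher
    print("scrapling.fetchers" in sys.modules)  # True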
scrapling/cli.py CHANGED
@@ -2,8 +2,9 @@ from pathlib import Path
  from subprocess import check_output
  from sys import executable as python_executable

+ from scrapling.core.utils import log
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _CookieParser, _ParseHeaders
+ from scrapling.core.utils._shell import _CookieParser, _ParseHeaders
  from scrapling.core._types import List, Optional, Dict, Tuple, Any, Callable

  from orjson import loads as json_loads, JSONDecodeError
@@ -32,8 +33,8 @@ def __ParseJSONData(json_string: Optional[str] = None) -> Optional[Dict[str, Any

      try:
          return json_loads(json_string)
-     except JSONDecodeError as e:  # pragma: no cover
-         raise ValueError(f"Invalid JSON data '{json_string}': {e}")
+     except JSONDecodeError as err:  # pragma: no cover
+         raise ValueError(f"Invalid JSON data '{json_string}': {err}")


  def __Request_and_Save(
@@ -65,8 +66,8 @@ def __ParseExtractArguments(
      for key, value in _CookieParser(cookies):
          try:
              parsed_cookies[key] = value
-         except Exception as e:
-             raise ValueError(f"Could not parse cookies '{cookies}': {e}")
+         except Exception as err:
+             raise ValueError(f"Could not parse cookies '{cookies}': {err}")

      parsed_json = __ParseJSONData(json)
      parsed_params = {}
@@ -135,10 +136,26 @@ def install(force):  # pragma: no cover


  @command(help="Run Scrapling's MCP server (Check the docs for more info).")
- def mcp():
+ @option(
+     "--http",
+     is_flag=True,
+     default=False,
+     help="Whether to run the MCP server in streamable-http transport or leave it as stdio (Default: False)",
+ )
+ @option(
+     "--host",
+     type=str,
+     default="0.0.0.0",
+     help="The host to use if streamable-http transport is enabled (Default: '0.0.0.0')",
+ )
+ @option(
+     "--port", type=int, default=8000, help="The port to use if streamable-http transport is enabled (Default: 8000)"
+ )
+ def mcp(http, host, port):
      from scrapling.core.ai import ScraplingMCPServer

-     ScraplingMCPServer().serve()
+     server = ScraplingMCPServer()
+     server.serve(http, host, port)


  @command(help="Interactive scraping console")
@@ -766,7 +783,7 @@ def stealthy_fetch(
      :param disable_resources: Drop requests of unnecessary resources for a speed boost.
      :param block_webrtc: Blocks WebRTC entirely.
      :param humanize: Humanize the cursor movement.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges.
      :param allow_webgl: Allow WebGL (recommended to keep enabled).
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Install the uBlock Origin addon on the browser.
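Taken together, the three new options let the mcp command pick its transport at launch time, with stdio remaining the default. A hypothetical invocation of the streamable-http mode, using only the flags defined above:

    scrapling mcp --http --host 127.0.0.1 --port 8000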
scrapling/core/_types.py CHANGED
@@ -39,6 +39,4 @@ except ImportError:  # pragma: no cover
      try:
          from typing_extensions import Self  # Backport
      except ImportError:
-         from typing import TypeVar
-
          Self = object
scrapling/core/ai.py CHANGED
@@ -42,10 +42,7 @@ def _ContentTranslator(content: Generator[str, None, None], page: _ScraplingResp


  class ScraplingMCPServer:
-     _server = FastMCP(name="Scrapling")
-
      @staticmethod
-     @_server.tool()
      def get(
          url: str,
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -124,7 +121,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_get(
          urls: Tuple[str, ...],
          impersonate: Optional[BrowserTypeLiteral] = "chrome",
@@ -211,7 +207,6 @@ class ScraplingMCPServer:
      ]

      @staticmethod
-     @_server.tool()
      async def fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -263,7 +258,7 @@ class ScraplingMCPServer:
      :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
      :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
      :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
      :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -300,7 +295,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -352,7 +346,7 @@ class ScraplingMCPServer:
      :param real_chrome: If you have a Chrome browser installed on your device, enable this, and the Fetcher will launch an instance of your browser and use it.
      :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
      :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
-     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+     :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers through CDP.
      :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search of this website's domain name.
      :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
      :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
@@ -394,7 +388,6 @@ class ScraplingMCPServer:
      ]

      @staticmethod
-     @_server.tool()
      async def stealthy_fetch(
          url: str,
          extraction_type: extraction_types = "markdown",
@@ -443,7 +436,7 @@ class ScraplingMCPServer:
      :param cookies: Set cookies for the next request.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
      :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -494,7 +487,6 @@ class ScraplingMCPServer:
      )

      @staticmethod
-     @_server.tool()
      async def bulk_stealthy_fetch(
          urls: Tuple[str, ...],
          extraction_type: extraction_types = "markdown",
@@ -543,7 +535,7 @@ class ScraplingMCPServer:
      :param cookies: Set cookies for the next request.
      :param addons: List of Firefox addons to use. Must be paths to extracted addons.
      :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
-     :param solve_cloudflare: Solves all 3 types of the Cloudflare's Turnstile wait page before returning the response to you.
+     :param solve_cloudflare: Solves all types of the Cloudflare's Turnstile/Interstitial challenges before returning the response to you.
      :param allow_webgl: Enabled by default. Disabling WebGL is not recommended as many WAFs now check if WebGL is enabled.
      :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
      :param disable_ads: Disabled by default, this installs the `uBlock Origin` addon on the browser if enabled.
@@ -598,6 +590,22 @@ class ScraplingMCPServer:
          for page in responses
      ]

-     def serve(self):
+     def serve(self, http: bool, host: str, port: int):
          """Serve the MCP server."""
-         self._server.run(transport="stdio")
+         server = FastMCP(name="Scrapling", host=host, port=port)
+         server.add_tool(self.get, title="get", description=self.get.__doc__, structured_output=True)
+         server.add_tool(self.bulk_get, title="bulk_get", description=self.bulk_get.__doc__, structured_output=True)
+         server.add_tool(self.fetch, title="fetch", description=self.fetch.__doc__, structured_output=True)
+         server.add_tool(
+             self.bulk_fetch, title="bulk_fetch", description=self.bulk_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.stealthy_fetch, title="stealthy_fetch", description=self.stealthy_fetch.__doc__, structured_output=True
+         )
+         server.add_tool(
+             self.bulk_stealthy_fetch,
+             title="bulk_stealthy_fetch",
+             description=self.bulk_stealthy_fetch.__doc__,
+             structured_output=True,
+         )
+         server.run(transport="stdio" if not http else "streamable-http")
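The structural change: previously a FastMCP instance was created at class-definition time (so merely importing scrapling.core.ai built a server) and tools were registered through @_server.tool() decorators; now the server is constructed and the tools registered only when serve() runs. A condensed sketch of the pattern, reusing the same FastMCP calls the diff shows (the import path is an assumption based on the official mcp SDK):

    from mcp.server.fastmcp import FastMCP  # assumed import path

    class Server:
        @staticmethod
        def echo(text: str) -> str:
            """Return the given text unchanged."""
            return text

        def serve(self, http: bool, host: str, port: int) -> None:
            # Nothing server-related exists until serve() is called.
            server = FastMCP(name="Example", host=host, port=port)
            server.add_tool(self.echo, title="echo", description=self.echo.__doc__, structured_output=True)
            server.run(transport="streamable-http" if http else "stdio")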
scrapling/core/custom_types.py CHANGED
@@ -145,7 +145,7 @@ class TextHandler(str):
      clean_match: bool = False,
      case_sensitive: bool = True,
      check_match: Literal[False] = False,
- ) -> "TextHandlers[TextHandler]": ...
+ ) -> "TextHandlers": ...

  def re(
      self,
@@ -241,7 +241,7 @@ class TextHandlers(List[TextHandler]):
      replace_entities: bool = True,
      clean_match: bool = False,
      case_sensitive: bool = True,
- ) -> "TextHandlers[TextHandler]":
+ ) -> "TextHandlers":
      """Call the ``.re()`` method for each element in this list and return
      their results flattened as TextHandlers.
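The annotation change is more than cosmetic: TextHandlers subclasses List[TextHandler] with a fixed element type, so it is not itself generic, and the old string annotation fails whenever something actually evaluates it, for example typing.get_type_hints(), which introspection-driven tooling commonly calls. A minimal reproduction under the class shapes shown above:

    from typing import List, get_type_hints

    class TextHandler(str): ...
    class TextHandlers(List[TextHandler]): ...

    def re_first(pattern: str) -> "TextHandlers[TextHandler]": ...

    try:
        get_type_hints(re_first)  # evaluates the string annotation
    except TypeError as err:
        print(err)  # TextHandlers is not a generic class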
scrapling/core/shell.py CHANGED
@@ -22,10 +22,11 @@ from logging import (
  from orjson import loads as json_loads, JSONDecodeError

  from scrapling import __version__
+ from scrapling.core.utils import log
  from scrapling.parser import Selector, Selectors
  from scrapling.core.custom_types import TextHandler
  from scrapling.engines.toolbelt.custom import Response
- from scrapling.core.utils import log, _ParseHeaders, _CookieParser
+ from scrapling.core.utils._shell import _ParseHeaders, _CookieParser
  from scrapling.core._types import (
      Optional,
      Dict,
@@ -201,7 +202,7 @@ class CurlParser:
          data_payload = parsed_args.data_binary  # Fallback to string

      elif parsed_args.data_raw is not None:
-         data_payload = parsed_args.data_raw
+         data_payload = parsed_args.data_raw.lstrip("$")

      elif parsed_args.data is not None:
          data_payload = parsed_args.data
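The new lstrip("$") addresses a copy-paste artifact: browsers' "Copy as cURL" often emits the body as --data-raw $'...' (shell ANSI-C quoting), and a plain tokenizer keeps the leading $ while stripping the quotes. A small illustration of the failure mode (tokenizing with shlex is an assumption about how CurlParser splits the command):

    from shlex import split as shlex_split

    # "Copy as cURL" output frequently quotes the body as $'...'
    tokens = shlex_split("""curl https://example.com --data-raw $'{"q": "test"}'""")
    data_raw = tokens[-1]        # ${"q": "test"}  <- the $ survives tokenization
    print(data_raw.lstrip("$"))  # {"q": "test"}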
@@ -317,8 +318,8 @@ def show_page_in_browser(page: Selector):  # pragma: no cover

      try:
          fd, fname = make_temp_file(prefix="scrapling_view_", suffix=".html")
-         with open(fd, "wb") as f:
-             f.write(page.body)
+         with open(fd, "w", encoding=page.encoding) as f:
+             f.write(page.html_content)

          open_in_browser(f"file://{fname}")
      except IOError as e:
@@ -545,7 +546,7 @@ class Convertor:
      for page in pages:
          match extraction_type:
              case "markdown":
-                 yield cls._convert_to_markdown(page.body)
+                 yield cls._convert_to_markdown(page.html_content)
              case "html":
                  yield page.body
              case "text":
scrapling/core/storage.py CHANGED
@@ -6,7 +6,6 @@ from sqlite3 import connect as db_connect

  from orjson import dumps, loads
  from lxml.html import HtmlElement
- from tldextract import extract as tld

  from scrapling.core.utils import _StorageTools, log
  from scrapling.core._types import Dict, Optional, Any
@@ -26,6 +25,8 @@ class StorageSystemMixin(ABC):  # pragma: no cover
          return default_value

      try:
+         from tldextract import extract as tld
+
          extracted = tld(self.url)
          return extracted.top_domain_under_public_suffix or extracted.domain or default_value
      except AttributeError:
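Deferring the tldextract import into the method applies the same policy as the new package __init__: importing scrapling no longer pays for tldextract (which initializes a public-suffix extractor) unless a storage backend actually resolves a base domain. A sketch of the shape, with a hypothetical function name and the same attribute access the diff shows:

    def _base_domain(url: str, default_value: str = "default") -> str:
        try:
            # Imported on first call, so module import stays cheap.
            from tldextract import extract as tld

            extracted = tld(url)
            return extracted.top_domain_under_public_suffix or extracted.domain or default_value
        except AttributeError:
            return default_value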
scrapling/core/utils/__init__.py CHANGED
@@ -7,4 +7,3 @@ from ._utils import (
      clean_spaces,
      html_forbidden,
  )
- from ._shell import _CookieParser, _ParseHeaders
@@ -1,2 +0,0 @@
- from ._controllers import DynamicSession, AsyncDynamicSession
- from ._camoufox import StealthySession, AsyncStealthySession
@@ -1,4 +1,4 @@
- from time import time, sleep
+ from time import time
  from asyncio import sleep as asyncio_sleep, Lock

  from camoufox import DefaultAddons
@@ -12,17 +12,13 @@ from camoufox.utils import (
      installed_verstr as camoufox_version,
  )

- from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route
- from scrapling.core._types import (
-     Any,
-     Dict,
-     Optional,
- )
  from ._page import PageInfo, PagePool
- from ._config_tools import _compiled_stealth_scripts
- from ._config_tools import _launch_kwargs, _context_kwargs
+ from scrapling.parser import Selector
+ from scrapling.core._types import Dict, Optional
  from scrapling.engines.toolbelt.fingerprints import get_os_name
  from ._validators import validate, PlaywrightConfig, CamoufoxConfig
+ from ._config_tools import _compiled_stealth_scripts, _launch_kwargs, _context_kwargs
+ from scrapling.engines.toolbelt.navigation import intercept_route, async_intercept_route

  __ff_version_str__ = camoufox_version().split(".", 1)[0]
@@ -44,23 +40,7 @@ class SyncSession:
      ) -> PageInfo:  # pragma: no cover
          """Get a new page to use"""

-         # Close all finished pages to ensure clean state
-         self.page_pool.close_all_finished_pages()
-
-         # If we're at max capacity after cleanup, wait for busy pages to finish
-         if self.page_pool.pages_count >= self.max_pages:
-             start_time = time()
-             while time() - start_time < self._max_wait_for_page:
-                 # Wait for any pages to finish, then clean them up
-                 sleep(0.05)
-                 self.page_pool.close_all_finished_pages()
-                 if self.page_pool.pages_count < self.max_pages:
-                     break
-             else:
-                 raise TimeoutError(
-                     f"No pages finished to clear place in the pool within the {self._max_wait_for_page}s timeout period"
-                 )
-
+         # No need to check if a page is available or not in sync code because the code blocked before reaching here till the page closed, ofc.
          page = self.context.new_page()
          page.set_default_navigation_timeout(timeout)
          page.set_default_timeout(timeout)
@@ -76,11 +56,6 @@

          return self.page_pool.add_page(page)

-     @staticmethod
-     def _get_with_precedence(request_value: Any, session_value: Any, sentinel_value: object) -> Any:
-         """Get value with request-level priority over session-level"""
-         return request_value if request_value is not sentinel_value else session_value
-
      def get_pool_stats(self) -> Dict[str, int]:
          """Get statistics about the current page pool"""
          return {
@@ -105,16 +80,11 @@ class AsyncSession(SyncSession):
      ) -> PageInfo:  # pragma: no cover
          """Get a new page to use"""
          async with self._lock:
-             # Close all finished pages to ensure clean state
-             await self.page_pool.aclose_all_finished_pages()
-
              # If we're at max capacity after cleanup, wait for busy pages to finish
              if self.page_pool.pages_count >= self.max_pages:
                  start_time = time()
                  while time() - start_time < self._max_wait_for_page:
-                     # Wait for any pages to finish, then clean them up
                      await asyncio_sleep(0.05)
-                     await self.page_pool.aclose_all_finished_pages()
                      if self.page_pool.pages_count < self.max_pages:
                          break
                  else:
@@ -294,4 +264,9 @@ class StealthySessionMixin:
              if f"cType: '{ctype}'" in page_content:
                  return ctype

+         # Check if turnstile captcha is embedded inside the page (Usually inside a closed Shadow iframe)
+         selector = Selector(content=page_content)
+         if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
+             return "embedded"
+
          return None
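The new branch covers pages that embed the Turnstile widget directly (often inside a closed shadow root) instead of serving a full interstitial page, by looking for the challenge script tag. A standalone sketch of the same check, using the Selector API exactly as the diff does:

    from scrapling.parser import Selector

    page_content = (
        '<html><body>'
        '<script src="https://challenges.cloudflare.com/turnstile/v0/api.js" async defer></script>'
        '</body></html>'
    )
    selector = Selector(content=page_content)
    if selector.css('script[src*="challenges.cloudflare.com/turnstile/v"]'):
        print("embedded")  # the new challenge type returned by the session mixin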