scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
+ import json
+ import logging
+ from scrapling.core._types import Union, Callable, Optional, List, Dict
+
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
+ from scrapling.engines.toolbelt import (
+     Response,
+     do_nothing,
+     js_bypass_path,
+     intercept_route,
+     generate_headers,
+     check_type_validity,
+     construct_cdp_url,
+     generate_convincing_referer,
+ )
+
+
+ class PlaywrightEngine:
+     def __init__(
+             self, headless: Union[bool, str] = True,
+             disable_resources: bool = False,
+             useragent: Optional[str] = None,
+             network_idle: Optional[bool] = False,
+             timeout: Optional[float] = 30000,
+             page_action: Callable = do_nothing,
+             wait_selector: Optional[str] = None,
+             wait_selector_state: Optional[str] = 'attached',
+             stealth: bool = False,
+             hide_canvas: bool = True,
+             disable_webgl: bool = False,
+             cdp_url: Optional[str] = None,
+             nstbrowser_mode: bool = False,
+             nstbrowser_config: Optional[Dict] = None,
+             google_search: Optional[bool] = True,
+             extra_headers: Optional[Dict[str, str]] = None,
+             adaptor_arguments: Dict = None
+     ):
+         """An engine that utilizes the Playwright library; check the `PlayWrightFetcher` class for more documentation.
+
+         :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
+         :param disable_resources: Drop requests for unnecessary resources for a speed boost. It depends on the website, but it made requests ~25% faster in my tests for some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent for the same browser and use it.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param stealth: Enables stealth mode; check the documentation to see what stealth mode currently does.
+         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will be completely ignored.
+         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+         :param nstbrowser_config: The config you want to send with requests to NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser docker browserless config.
+         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+         """
+         self.headless = headless
+         self.disable_resources = disable_resources
+         self.network_idle = bool(network_idle)
+         self.stealth = bool(stealth)
+         self.hide_canvas = bool(hide_canvas)
+         self.disable_webgl = bool(disable_webgl)
+         self.google_search = bool(google_search)
+         self.extra_headers = extra_headers or {}
+         self.cdp_url = cdp_url
+         self.useragent = useragent
+         self.timeout = check_type_validity(timeout, [int, float], 30000)
+         if callable(page_action):
+             self.page_action = page_action
+         else:
+             self.page_action = do_nothing
+             logging.error('[Ignored] Argument "page_action" must be callable')
+
+         self.wait_selector = wait_selector
+         self.wait_selector_state = wait_selector_state
+         self.nstbrowser_mode = bool(nstbrowser_mode)
+         self.nstbrowser_config = nstbrowser_config
+         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+         """Constructs a new CDP URL if NSTBrowser mode is enabled, otherwise returns the CDP URL as it is.
+
+         :param flags: Chrome flags to be added to the NSTBrowser query.
+         :return: CDP URL
+         """
+         cdp_url = self.cdp_url
+         if self.nstbrowser_mode:
+             if self.nstbrowser_config and isinstance(self.nstbrowser_config, dict):
+                 config = self.nstbrowser_config
+             else:
+                 query = NSTBROWSER_DEFAULT_QUERY.copy()
+                 if flags:
+                     query.update({
+                         "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
+                     })
+
+                 config = {
+                     'config': json.dumps(query),
+                     # 'token': ''
+                 }
+             cdp_url = construct_cdp_url(cdp_url, config)
+         else:
+             # To validate it
+             cdp_url = construct_cdp_url(cdp_url)
+
+         return cdp_url
+
+     def fetch(self, url: str) -> Response:
+         """Opens up the browser and does your request based on your chosen options.
+
+         :param url: Target url.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         if not self.stealth:
+             from playwright.sync_api import sync_playwright
+         else:
+             from rebrowser_playwright.sync_api import sync_playwright
+
+         with sync_playwright() as p:
+             # Handle the UserAgent early
+             if self.useragent:
+                 extra_headers = {}
+                 useragent = self.useragent
+             else:
+                 extra_headers = generate_headers(browser_mode=True)
+                 useragent = extra_headers.get('User-Agent')
+
+             # Prepare the flags before diving
+             flags = DEFAULT_STEALTH_FLAGS
+             if self.hide_canvas:
+                 flags += ['--fingerprinting-canvas-image-data-noise']
+             if self.disable_webgl:
+                 flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+
+             # Creating the browser
+             if self.cdp_url:
+                 cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+             else:
+                 if self.stealth:
+                     browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
+                 else:
+                     browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+
+             # Creating the context
+             if self.stealth:
+                 context = browser.new_context(
+                     locale='en-US',
+                     is_mobile=False,
+                     has_touch=False,
+                     color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
+                     user_agent=useragent,
+                     device_scale_factor=2,
+                     # I'm thinking about disabling it to rest from all the Service Workers headache, but let's keep it as it is for now
+                     service_workers="allow",
+                     ignore_https_errors=True,
+                     extra_http_headers=extra_headers,
+                     screen={"width": 1920, "height": 1080},
+                     viewport={"width": 1920, "height": 1080},
+                     permissions=["geolocation", 'notifications'],
+                 )
+             else:
+                 context = browser.new_context(
+                     color_scheme='dark',
+                     user_agent=useragent,
+                     device_scale_factor=2,
+                     extra_http_headers=extra_headers
+                 )
+
+             # Finally we are in business
+             page = context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+
+             if self.extra_headers:
+                 page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 page.route("**/*", intercept_route)
+
+             if self.stealth:
+                 # Basic bypasses, nothing fancy as I'm still working on it
+                 # But by adding these bypasses to the above config, it bypasses many online tests like
+                 # https://bot.sannysoft.com/
+                 # https://kaliiiiiiiiii.github.io/brotector/
+                 # https://pixelscan.net/
+                 # https://iphey.com/
+                 # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+                 # https://arh.antoinevastel.com/bots/areyouheadless/
+                 # https://prescience-data.github.io/execution-monitor.html
+                 page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
+                 page.add_init_script(path=js_bypass_path('window_chrome.js'))
+                 page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
+                 page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
+                 page.add_init_script(path=js_bypass_path('notification_permission.js'))
+                 page.add_init_script(path=js_bypass_path('screen_props.js'))
+                 page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+
+             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             page.wait_for_load_state(state="domcontentloaded")
+             if self.network_idle:
+                 page.wait_for_load_state('networkidle')
+
+             page = self.page_action(page)
+
+             if self.wait_selector and type(self.wait_selector) is str:
+                 waiter = page.locator(self.wait_selector)
+                 waiter.wait_for(state=self.wait_selector_state)
+
+             content_type = res.headers.get('content-type', '')
+             # Parse charset from content-type
+             encoding = 'utf-8'  # default encoding
+             if 'charset=' in content_type.lower():
+                 encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+
+             response = Response(
+                 url=res.url,
+                 text=page.content(),
+                 content=res.body(),
+                 status=res.status,
+                 reason=res.status_text,
+                 encoding=encoding,
+                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
+                 headers=res.all_headers(),
+                 request_headers=res.request.all_headers(),
+                 adaptor_arguments=self.adaptor_arguments
+             )
+             page.close()
+             return response
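
For orientation, a minimal usage sketch of the engine added above (illustrative, not part of the diff). The import path and the direct use of the engine are assumptions based on the package layout shown in this file; in practice it is meant to be driven through the higher-level `PlayWrightFetcher` class that the docstring references.

    from scrapling.engines import PlaywrightEngine  # hypothetical import path, inferred from the imports above

    # Stealthy, headless fetch that blocks heavy resources and waits for an <h1> to be attached
    engine = PlaywrightEngine(
        headless=True,
        stealth=True,
        disable_resources=True,
        network_idle=True,
        wait_selector='h1',
    )
    resp = engine.fetch('https://example.com')      # returns the unified Response dataclass
    print(resp.status, resp.reason, resp.encoding)
    page_adaptor = resp.adaptor                     # Adaptor built from the rendered page, or None
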
@@ -0,0 +1,112 @@
+ import logging
+
+ from scrapling.core._types import Union, Optional, Dict
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+ import httpx
+ from httpx._models import Response as httpxResponse
+
+
+ class StaticEngine:
+     def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+         """An engine that utilizes the httpx library; check the `Fetcher` class for more documentation.
+
+         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+         """
+         self.timeout = timeout
+         self.follow_redirects = bool(follow_redirects)
+         self._extra_headers = generate_headers(browser_mode=False)
+         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+     @staticmethod
+     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+         """Adds a useragent to the headers if one doesn't exist, generates real headers and appends them to the current headers, and
+         finally generates a referer header that looks as if this request came from a Google search for the current URL's domain.
+
+         :param headers: Current headers in the request if the user passed any.
+         :param url: The target URL.
+         :param stealth: Whether stealth mode is enabled or not.
+         :return: A dictionary of the new headers.
+         """
+         headers = headers or {}
+
+         # Validate headers
+         if not headers.get('user-agent') and not headers.get('User-Agent'):
+             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
+             logging.info(f"Can't find a useragent in headers so '{headers['User-Agent']}' was used.")
+
+         if stealth:
+             extra_headers = generate_headers(browser_mode=False)
+             headers.update(extra_headers)
+             headers.update({'referer': generate_convincing_referer(url)})
+
+         return headers
+
+     def _prepare_response(self, response: httpxResponse) -> Response:
+         """Takes an httpx response and generates a `Response` object from it.
+
+         :param response: httpx response object
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         return Response(
+             url=str(response.url),
+             text=response.text,
+             content=response.content,
+             status=response.status_code,
+             reason=response.reason_phrase,
+             encoding=response.encoding or 'utf-8',
+             cookies=dict(response.cookies),
+             headers=dict(response.headers),
+             request_headers=dict(response.request.headers),
+             adaptor_arguments=self.adaptor_arguments
+         )
+
+     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Make a basic HTTP GET request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
+             create a referer header as if this request came from a Google search for this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.get()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         response = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(response)
+
+     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Make a basic HTTP POST request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
+             create a referer header as if this request came from a Google search for this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.post()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         response = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(response)
+
+     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Make a basic HTTP DELETE request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
+             create a referer header as if this request came from a Google search for this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.delete()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         response = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(response)
+
+     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Make a basic HTTP PUT request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), Fetcher will create and add real browser headers and
+             create a referer header as if this request came from a Google search for this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.put()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         response = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(response)
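
A small sketch of how the verb helpers above might be called directly (illustrative only, not part of the diff; normally this engine sits behind the `Fetcher` class mentioned in its docstring, and the import path is an assumption):

    from scrapling.engines import StaticEngine  # hypothetical import path

    engine = StaticEngine(follow_redirects=True, timeout=10)

    # GET with generated browser-like headers and a Google-search referer
    resp = engine.get('https://example.com', stealthy_headers=True)
    print(resp.status, resp.reason, resp.headers.get('content-type'))

    # Extra keyword arguments are forwarded straight to httpx
    resp = engine.post('https://httpbin.org/post', data={'q': 'scrapling'})
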
@@ -0,0 +1,18 @@
+ from .fingerprints import (
+     get_os_name,
+     generate_headers,
+     generate_convincing_referer,
+ )
+ from .custom import (
+     Response,
+     do_nothing,
+     BaseFetcher,
+     get_variable_name,
+     check_type_validity,
+     check_if_engine_usable,
+ )
+ from .navigation import (
+     js_bypass_path,
+     intercept_route,
+     construct_cdp_url,
+ )
@@ -0,0 +1,168 @@
+ """
+ Functions related to custom types or type checking
+ """
+ import inspect
+ import logging
+ from dataclasses import dataclass, field
+
+ from scrapling.core.utils import setup_basic_logging
+ from scrapling.parser import Adaptor, SQLiteStorageSystem
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+
+
+ @dataclass(frozen=True)
+ class Response:
+     """This class is returned by all engines as a way to unify the response type between different libraries."""
+     url: str
+     text: str
+     content: bytes
+     status: int
+     reason: str
+     encoding: str = 'utf-8'  # default encoding
+     cookies: Dict = field(default_factory=dict)
+     headers: Dict = field(default_factory=dict)
+     request_headers: Dict = field(default_factory=dict)
+     adaptor_arguments: Dict = field(default_factory=dict)
+
+     @property
+     def adaptor(self) -> Union[Adaptor, None]:
+         """Generate an Adaptor instance from this response if possible, otherwise return None."""
+         automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
+         if self.text:
+             # For playwright, this will be the response after all JS is executed
+             return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+         elif self.content:
+             # For playwright, that's after all JS is loaded but not all of it executed, because playwright doesn't offer something like page.content()
+             # to get the response bytes after the load states
+             # Reference: https://playwright.dev/python/docs/api/class-page
+             return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+         return None
+
+     def __repr__(self):
+         return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
+
+
+ class BaseFetcher:
+     def __init__(
+             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
+             storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+             automatch_domain: Optional[str] = None,
+     ):
+         """The arguments below are the same as in the Adaptor class, so you can pass them directly; the rest of Adaptor's arguments
+         are detected and passed automatically from the Fetcher based on the response for accessibility.
+
+         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls the
+             libxml2 feature that forbids parsing certain large documents to protect against possible memory exhaustion.
+         :param keep_comments: While parsing the HTML body, keep comments or not. Disabled by default for obvious reasons.
+         :param auto_match: Globally turn off the auto-match feature in all functions; this argument takes higher
+             priority over all auto-match related arguments/functions in the class.
+         :param storage: The storage class to be passed for auto-matching functionalities; see ``Docs`` for more info.
+         :param storage_args: A dictionary of ``argument->value`` pairs to be passed to the storage class.
+             If empty, default values will be used.
+         :param automatch_domain: For cases where you want to auto-match selectors across different websites as if they were on the same website, use this argument to unify them.
+             Otherwise, the domain of the request is used by default.
+         :param debug: Enable debug mode.
+         """
+         # Adaptor class parameters
+         # I won't validate Adaptor's class parameters here again; I will leave them to be validated later
+         self.adaptor_arguments = dict(
+             huge_tree=huge_tree,
+             keep_comments=keep_comments,
+             auto_match=auto_match,
+             storage=storage,
+             storage_args=storage_args,
+             debug=debug,
+         )
+         # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
+         setup_basic_logging(level='debug' if debug else 'info')
+         if automatch_domain:
+             if type(automatch_domain) is not str:
+                 logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
+             else:
+                 self.adaptor_arguments.update({'automatch_domain': automatch_domain})
+
+
+ def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
+     """This function checks whether the passed engine can be used by a Fetcher-type class or not.
+
+     :param engine: The engine class itself
+     :return: The engine class again if all checks pass, otherwise raises an error
+     :raise TypeError: If the engine class doesn't have a fetch method, has a fetch attribute that is not a method, or has a fetch method that doesn't take arguments
+     """
+     # if isinstance(engine, type):
+     #     raise TypeError("Expected an engine instance, not a class definition of the engine")
+
+     if hasattr(engine, 'fetch'):
+         fetch_function = getattr(engine, "fetch")
+         if callable(fetch_function):
+             if len(inspect.signature(fetch_function).parameters) > 0:
+                 return engine
+             else:
+                 # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
+                 raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
+         else:
+             # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
+             raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
+     else:
+         # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
+         raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
+
+
+ def get_variable_name(var: Any) -> Optional[str]:
+     """Get the name of a variable using the global and local scopes.
+     :param var: The variable to find the name for
+     :return: The name of the variable if found, None otherwise
+     """
+     for scope in [globals(), locals()]:
+         for name, value in scope.items():
+             if value is var:
+                 return name
+     return None
+
+
+ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
+     """Check if a variable matches the specified type constraints.
+     :param variable: The variable to check
+     :param valid_types: List of valid types for the variable
+     :param default_value: Value to return if the type check fails
+     :param critical: If True, raises TypeError instead of logging the error
+     :param param_name: Optional parameter name for error messages
+     :return: The original variable if valid, default_value if invalid
+     :raise TypeError: If critical=True and the type check fails
+     """
+     # Use the provided param_name or try to get it automatically
+     var_name = param_name or get_variable_name(variable) or "Unknown"
+
+     # Convert valid_types to a list if None
+     valid_types = valid_types or []
+
+     # Handle None value
+     if variable is None:
+         if type(None) in valid_types:
+             return variable
+         error_msg = f'Argument "{var_name}" cannot be None'
+         if critical:
+             raise TypeError(error_msg)
+         logging.error(f'[Ignored] {error_msg}')
+         return default_value
+
+     # If no valid_types are specified and the variable has a value, return it
+     if not valid_types:
+         return variable
+
+     # Check if the variable type matches any of the valid types
+     if not any(isinstance(variable, t) for t in valid_types):
+         type_names = [t.__name__ for t in valid_types]
+         error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
+         if critical:
+             raise TypeError(error_msg)
+         logging.error(f'[Ignored] {error_msg}')
+         return default_value
+
+     return variable
+
+
+ # Pew Pew
+ def do_nothing(page):
+     # Just works as a filler for the `page_action` argument in browser engines
+     return page
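
To make the fallback behaviour of `check_type_validity` above concrete, a short sketch (assuming it is imported through the `scrapling.engines.toolbelt` package whose `__init__` re-exports it, as shown earlier in this diff):

    from scrapling.engines.toolbelt import check_type_validity

    # A value of an accepted type is returned unchanged
    assert check_type_validity(30000, [int, float], default_value=10000) == 30000

    # A wrong type is logged as "[Ignored] ..." and the default is returned instead
    assert check_type_validity('30000', [int, float], default_value=10000) == 10000

    # With critical=True a TypeError is raised instead of falling back to the default
    try:
        check_type_validity(None, [int], critical=True, param_name='timeout')
    except TypeError as error:
        print(error)  # Argument "timeout" cannot be None
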
@@ -0,0 +1,81 @@
+ """
+ Functions related to generating headers and fingerprints generally
+ """
+
+ import platform
+
+ from scrapling.core.utils import cache
+ from scrapling.core._types import Union, Dict
+
+ from tldextract import extract
+ from browserforge.headers import HeaderGenerator, Browser
+ from browserforge.fingerprints import FingerprintGenerator, Fingerprint
+
+
+ @cache(None, typed=True)
+ def generate_convincing_referer(url: str) -> str:
+     """Takes the domain from the URL without the subdomain/suffix and makes it look like you were searching Google for this website
+
+     >>> generate_convincing_referer('https://www.somewebsite.com/blah')
+     'https://www.google.com/search?q=somewebsite'
+
+     :param url: The URL you are about to fetch.
+     :return: Google's search URL of the domain name
+     """
+     website_name = extract(url).domain
+     return f'https://www.google.com/search?q={website_name}'
+
+
+ @cache(None, typed=True)
+ def get_os_name() -> Union[str, None]:
+     """Get the current OS name in the same format needed for browserforge
+
+     :return: Current OS name or `None` otherwise
+     """
+     #
+     os_name = platform.system()
+     return {
+         'Linux': 'linux',
+         'Darwin': 'macos',
+         'Windows': 'windows',
+         # For the future? because why not
+         'iOS': 'ios',
+     }.get(os_name)
+
+
+ def generate_suitable_fingerprint() -> Fingerprint:
+     """Generates a browserforge fingerprint that matches the current OS, a desktop device, and Chrome version 128 at least.
+
+     This function was originally created to test Browserforge's injector.
+     :return: `Fingerprint` object
+     """
+     return FingerprintGenerator(
+         browser=[Browser(name='chrome', min_version=128)],
+         os=get_os_name(),  # None is ignored
+         device='desktop'
+     ).generate()
+
+
+ def generate_headers(browser_mode: bool = False) -> Dict:
+     """Generate real browser-like headers using browserforge's generator
+
+     :param browser_mode: If enabled, the headers created are used for playwright, so they have to match everything
+     :return: A dictionary of the generated headers
+     """
+     if browser_mode:
+         # In this mode we don't care about anything other than matching the OS and the browser type with the browser we are using,
+         # so we don't raise any inconsistency red flags while websites fingerprint us
+         os_name = get_os_name()
+         return HeaderGenerator(
+             browser=[Browser(name='chrome', min_version=128)],
+             os=os_name,  # None is ignored
+             device='desktop'
+         ).generate()
+     else:
+         # Here it's used for normal requests that aren't done through browsers, so we can take it lightly
+         browsers = [
+             Browser(name='chrome', min_version=120),
+             Browser(name='firefox', min_version=120),
+             Browser(name='edge', min_version=120),
+         ]
+         return HeaderGenerator(browser=browsers, device='desktop').generate()
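
As a quick illustration of the two header modes and the referer helper above (a sketch, assuming the `toolbelt` re-exports shown earlier in this diff; the exact header contents depend on browserforge's data files):

    from scrapling.engines.toolbelt import generate_headers, generate_convincing_referer

    # Full, browser-consistent header set used to feed the Playwright context
    browser_headers = generate_headers(browser_mode=True)
    print(browser_headers.get('User-Agent'))

    # Lighter header set for plain httpx requests (Chrome/Firefox/Edge >= 120)
    print(sorted(generate_headers(browser_mode=False)))

    # Referer that looks like a Google search for the target's domain
    print(generate_convincing_referer('https://www.somewebsite.com/blah'))
    # -> https://www.google.com/search?q=somewebsite
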