scrapling 0.1.2__py3-none-any.whl → 0.2__py3-none-any.whl

@@ -0,0 +1,232 @@
+ import json
+ import logging
+ from scrapling.core._types import Union, Callable, Optional, List, Dict
+
+ from scrapling.engines.constants import DEFAULT_STEALTH_FLAGS, NSTBROWSER_DEFAULT_QUERY
+ from scrapling.engines.toolbelt import (
+     Response,
+     do_nothing,
+     js_bypass_path,
+     intercept_route,
+     generate_headers,
+     check_type_validity,
+     construct_cdp_url,
+     generate_convincing_referer,
+ )
+
+
+ class PlaywrightEngine:
+     def __init__(
+         self, headless: Union[bool, str] = True,
+         disable_resources: bool = False,
+         useragent: Optional[str] = None,
+         network_idle: Optional[bool] = False,
+         timeout: Optional[float] = 30000,
+         page_action: Callable = do_nothing,
+         wait_selector: Optional[str] = None,
+         wait_selector_state: Optional[str] = 'attached',
+         stealth: bool = False,
+         hide_canvas: bool = True,
+         disable_webgl: bool = False,
+         cdp_url: Optional[str] = None,
+         nstbrowser_mode: bool = False,
+         nstbrowser_config: Optional[Dict] = None,
+         google_search: Optional[bool] = True,
+         extra_headers: Optional[Dict[str, str]] = None,
+         adaptor_arguments: Dict = None
+     ):
+         """An engine that utilizes the Playwright library; check the `PlayWrightFetcher` class for more documentation.
+
+         :param headless: Run the browser in headless/hidden (default) or headful/visible mode.
+         :param disable_resources: Drop requests of unnecessary resources for a speed boost. Results vary, but it made requests ~25% faster in my tests on some websites.
+             Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+             This can help save your proxy usage, but be careful with this option as it makes some websites never finish loading.
+         :param useragent: Pass a useragent string to be used. Otherwise, the fetcher will generate a real useragent of the same browser and use it.
+         :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+         :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000.
+         :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+         :param wait_selector: Wait for a specific CSS selector to be in a specific state.
+         :param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
+         :param stealth: Enables stealth mode; check the documentation to see what stealth mode does currently.
+         :param hide_canvas: Add random noise to canvas operations to prevent fingerprinting.
+         :param disable_webgl: Disables WebGL and WebGL 2.0 support entirely.
+         :param cdp_url: Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP.
+         :param nstbrowser_mode: Enables NSTBrowser mode; it has to be used with the `cdp_url` argument or it will be completely ignored.
+         :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+         :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+         :param nstbrowser_config: The config you want to send with requests to NSTBrowser. If left empty, Scrapling defaults to an optimized NSTBrowser docker-browserless config.
+         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+         """
+         self.headless = headless
+         self.disable_resources = disable_resources
+         self.network_idle = bool(network_idle)
+         self.stealth = bool(stealth)
+         self.hide_canvas = bool(hide_canvas)
+         self.disable_webgl = bool(disable_webgl)
+         self.google_search = bool(google_search)
+         self.extra_headers = extra_headers or {}
+         self.cdp_url = cdp_url
+         self.useragent = useragent
+         self.timeout = check_type_validity(timeout, [int, float], 30000)
+         if callable(page_action):
+             self.page_action = page_action
+         else:
+             self.page_action = do_nothing
+             logging.error('[Ignored] Argument "page_action" must be callable')
+
+         self.wait_selector = wait_selector
+         self.wait_selector_state = wait_selector_state
+         self.nstbrowser_mode = bool(nstbrowser_mode)
+         self.nstbrowser_config = nstbrowser_config
+         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+     def _cdp_url_logic(self, flags: Optional[List] = None) -> str:
+         """Constructs a new CDP URL if NSTBrowser mode is enabled; otherwise returns the CDP URL as-is.
+
+         :param flags: Chrome flags to be added to the NSTBrowser query
+         :return: CDP URL
+         """
+         cdp_url = self.cdp_url
+         if self.nstbrowser_mode:
+             if self.nstbrowser_config and type(self.nstbrowser_config) is dict:
+                 config = self.nstbrowser_config
+             else:
+                 query = NSTBROWSER_DEFAULT_QUERY.copy()
+                 if flags:
+                     query.update({
+                         "args": dict(zip(flags, [''] * len(flags))),  # browser args should be a dictionary
+                     })
+
+                 config = {
+                     'config': json.dumps(query),
+                     # 'token': ''
+                 }
+             cdp_url = construct_cdp_url(cdp_url, config)
+         else:
+             # To validate it
+             cdp_url = construct_cdp_url(cdp_url)
+
+         return cdp_url
+
+     def fetch(self, url: str) -> Response:
+         """Opens up the browser and does your request based on your chosen options.
+
+         :param url: Target url.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         if not self.stealth:
+             from playwright.sync_api import sync_playwright
+         else:
+             from rebrowser_playwright.sync_api import sync_playwright
+
+         with sync_playwright() as p:
+             # Handle the UserAgent early
+             if self.useragent:
+                 extra_headers = {}
+                 useragent = self.useragent
+             else:
+                 extra_headers = generate_headers(browser_mode=True)
+                 useragent = extra_headers.get('User-Agent')
+
+             # Prepare the flags before diving
+             flags = DEFAULT_STEALTH_FLAGS
+             if self.hide_canvas:
+                 flags += ['--fingerprinting-canvas-image-data-noise']
+             if self.disable_webgl:
+                 flags += ['--disable-webgl', '--disable-webgl-image-chromium', '--disable-webgl2']
+
+             # Creating the browser
+             if self.cdp_url:
+                 cdp_url = self._cdp_url_logic(flags if self.stealth else None)
+                 browser = p.chromium.connect_over_cdp(endpoint_url=cdp_url)
+             else:
+                 if self.stealth:
+                     browser = p.chromium.launch(headless=self.headless, args=flags, ignore_default_args=['--enable-automation'], chromium_sandbox=True)
+                 else:
+                     browser = p.chromium.launch(headless=self.headless, ignore_default_args=['--enable-automation'])
+
+             # Creating the context
+             if self.stealth:
+                 context = browser.new_context(
+                     locale='en-US',
+                     is_mobile=False,
+                     has_touch=False,
+                     color_scheme='dark',  # Bypasses the 'prefersLightColor' check in creepjs
+                     user_agent=useragent,
+                     device_scale_factor=2,
+                     # I'm thinking about disabling it to rest from all the Service Workers headache, but let's keep it as it is for now
+                     service_workers="allow",
+                     ignore_https_errors=True,
+                     extra_http_headers=extra_headers,
+                     screen={"width": 1920, "height": 1080},
+                     viewport={"width": 1920, "height": 1080},
+                     permissions=["geolocation", 'notifications'],
+                 )
+             else:
+                 context = browser.new_context(
+                     color_scheme='dark',
+                     user_agent=useragent,
+                     device_scale_factor=2,
+                     extra_http_headers=extra_headers
+                 )
+
+             # Finally we are in business
+             page = context.new_page()
+             page.set_default_navigation_timeout(self.timeout)
+             page.set_default_timeout(self.timeout)
+
+             if self.extra_headers:
+                 page.set_extra_http_headers(self.extra_headers)
+
+             if self.disable_resources:
+                 page.route("**/*", intercept_route)
+
+             if self.stealth:
+                 # Basic bypasses, nothing fancy, as I'm still working on it
+                 # But with these bypasses added to the above config, it passes many online tests like
+                 # https://bot.sannysoft.com/
+                 # https://kaliiiiiiiiii.github.io/brotector/
+                 # https://pixelscan.net/
+                 # https://iphey.com/
+                 # https://www.browserscan.net/bot-detection <== this one also checks for the CDP runtime fingerprint
+                 # https://arh.antoinevastel.com/bots/areyouheadless/
+                 # https://prescience-data.github.io/execution-monitor.html
+                 page.add_init_script(path=js_bypass_path('webdriver_fully.js'))
+                 page.add_init_script(path=js_bypass_path('window_chrome.js'))
+                 page.add_init_script(path=js_bypass_path('navigator_plugins.js'))
+                 page.add_init_script(path=js_bypass_path('pdf_viewer.js'))
+                 page.add_init_script(path=js_bypass_path('notification_permission.js'))
+                 page.add_init_script(path=js_bypass_path('screen_props.js'))
+                 page.add_init_script(path=js_bypass_path('playwright_fingerprint.js'))
+
+             res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+             page.wait_for_load_state(state="domcontentloaded")
+             if self.network_idle:
+                 page.wait_for_load_state('networkidle')
+
+             page = self.page_action(page)
+
+             if self.wait_selector and type(self.wait_selector) is str:
+                 waiter = page.locator(self.wait_selector)
+                 waiter.wait_for(state=self.wait_selector_state)
+
+             content_type = res.headers.get('content-type', '')
+             # Parse charset from content-type
+             encoding = 'utf-8'  # default encoding
+             if 'charset=' in content_type.lower():
+                 encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+
+             response = Response(
+                 url=res.url,
+                 text=page.content(),
+                 content=res.body(),
+                 status=res.status,
+                 reason=res.status_text,
+                 encoding=encoding,
+                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
+                 headers=res.all_headers(),
+                 request_headers=res.request.all_headers(),
+                 adaptor_arguments=self.adaptor_arguments
+             )
+             page.close()
+             return response
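
A minimal usage sketch of the engine above, for orientation. The import path is an assumption (this diff shows file contents, not file names), and the URL is a placeholder:

    from scrapling.engines import PlaywrightEngine   # hypothetical import path

    engine = PlaywrightEngine(stealth=True, network_idle=True, timeout=30000)
    resp = engine.fetch('https://example.com')        # unified Response object defined in toolbelt/custom below
    print(resp.status, resp.reason)                   # e.g. 200 OK
    parser = resp.adaptor                             # Adaptor built from the rendered HTML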
@@ -0,0 +1,112 @@
+ import logging
+
+ from scrapling.core._types import Union, Optional, Dict
+ from .toolbelt import Response, generate_convincing_referer, generate_headers
+
+ import httpx
+ from httpx._models import Response as httpxResponse
+
+
+ class StaticEngine:
+     def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
+         """An engine that utilizes the httpx library; check the `Fetcher` class for more documentation.
+
+         :param follow_redirects: As the name says -- if enabled (default), redirects will be followed.
+         :param timeout: The time to wait for the request to finish in seconds. The default is 10 seconds.
+         :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+         """
+         self.timeout = timeout
+         self.follow_redirects = bool(follow_redirects)
+         self._extra_headers = generate_headers(browser_mode=False)
+         self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+     @staticmethod
+     def _headers_job(headers: Optional[Dict], url: str, stealth: bool) -> Dict:
+         """Adds a useragent to the headers if one doesn't exist, generates real headers and appends them to the current headers, and
+         finally generates a referer header that looks as if this request came from Google's search of the current URL's domain.
+
+         :param headers: Current headers in the request if the user passed any
+         :param url: The Target URL.
+         :param stealth: Whether stealth mode is enabled or not.
+         :return: A dictionary of the new headers.
+         """
+         headers = headers or {}
+
+         # Validate headers
+         if not headers.get('user-agent') and not headers.get('User-Agent'):
+             headers['User-Agent'] = generate_headers(browser_mode=False).get('User-Agent')
+             logging.info(f"Can't find useragent in headers so '{headers['User-Agent']}' was used.")
+
+         if stealth:
+             extra_headers = generate_headers(browser_mode=False)
+             headers.update(extra_headers)
+             headers.update({'referer': generate_convincing_referer(url)})
+
+         return headers
+
+     def _prepare_response(self, response: httpxResponse) -> Response:
+         """Takes an httpx response and generates a `Response` object from it.
+
+         :param response: httpx response object
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         return Response(
+             url=str(response.url),
+             text=response.text,
+             content=response.content,
+             status=response.status_code,
+             reason=response.reason_phrase,
+             encoding=response.encoding or 'utf-8',
+             cookies=dict(response.cookies),
+             headers=dict(response.headers),
+             request_headers=dict(response.request.headers),
+             adaptor_arguments=self.adaptor_arguments
+         )
+
+     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Makes a basic HTTP GET request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), the Fetcher will create and add real browser headers and
+             create a referer header as if this request had come from Google's search of this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.get()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         request = httpx.get(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(request)
+
+     def post(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Makes a basic HTTP POST request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), the Fetcher will create and add real browser headers and
+             create a referer header as if this request had come from Google's search of this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.post()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         request = httpx.post(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(request)
+
+     def delete(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Makes a basic HTTP DELETE request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), the Fetcher will create and add real browser headers and
+             create a referer header as if this request had come from Google's search of this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.delete()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         request = httpx.delete(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(request)
+
+     def put(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
+         """Makes a basic HTTP PUT request for you but with some added flavors.
+         :param url: Target url.
+         :param stealthy_headers: If enabled (default), the Fetcher will create and add real browser headers and
+             create a referer header as if this request had come from Google's search of this URL's domain.
+         :param kwargs: Any additional keyword arguments are passed directly to the `httpx.put()` function, so check the httpx documentation for details.
+         :return: A Response object with `url`, `text`, `content`, `status`, `reason`, `encoding`, `cookies`, `headers`, `request_headers`, and the `adaptor` class for parsing, of course.
+         """
+         headers = self._headers_job(kwargs.pop('headers', None), url, stealthy_headers)
+         request = httpx.put(url=url, headers=headers, follow_redirects=self.follow_redirects, timeout=self.timeout, **kwargs)
+         return self._prepare_response(request)
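
A quick sketch of how these wrappers are meant to be called (import path assumed, since file names are not shown in this diff):

    from scrapling.engines import StaticEngine        # hypothetical import path

    engine = StaticEngine(follow_redirects=True, timeout=10)
    resp = engine.get('https://example.com', stealthy_headers=True)
    print(resp.status, resp.encoding)
    # Any extra keyword arguments go straight through to httpx:
    resp = engine.post('https://example.com/login', data={'user': 'demo'})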
@@ -0,0 +1,18 @@
+ from .fingerprints import (
+     get_os_name,
+     generate_headers,
+     generate_convincing_referer,
+ )
+ from .custom import (
+     Response,
+     do_nothing,
+     BaseFetcher,
+     get_variable_name,
+     check_type_validity,
+     check_if_engine_usable,
+ )
+ from .navigation import (
+     js_bypass_path,
+     intercept_route,
+     construct_cdp_url,
+ )
@@ -0,0 +1,168 @@
+ """
+ Functions related to custom types or type checking
+ """
+ import inspect
+ import logging
+ from dataclasses import dataclass, field
+
+ from scrapling.core.utils import setup_basic_logging
+ from scrapling.parser import Adaptor, SQLiteStorageSystem
+ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
+
+
+ @dataclass(frozen=True)
+ class Response:
+     """This class is returned by all engines as a way to unify response type between different libraries."""
+     url: str
+     text: str
+     content: bytes
+     status: int
+     reason: str
+     encoding: str = 'utf-8'  # default encoding
+     cookies: Dict = field(default_factory=dict)
+     headers: Dict = field(default_factory=dict)
+     request_headers: Dict = field(default_factory=dict)
+     adaptor_arguments: Dict = field(default_factory=dict)
+
+     @property
+     def adaptor(self) -> Union[Adaptor, None]:
+         """Generate an Adaptor instance from this response if possible, otherwise return None"""
+         automatch_domain = self.adaptor_arguments.pop('automatch_domain', None)
+         if self.text:
+             # For playwright, this will be the response after all JS has executed
+             return Adaptor(text=self.text, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+         elif self.content:
+             # For playwright, that's after all JS is loaded but not all of it executed, because playwright doesn't offer something like page.content()
+             # to get the response bytes after the load states
+             # Reference: https://playwright.dev/python/docs/api/class-page
+             return Adaptor(body=self.content, url=automatch_domain or self.url, encoding=self.encoding, **self.adaptor_arguments)
+         return None
+
+     def __repr__(self):
+         return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
+
+
+ class BaseFetcher:
+     def __init__(
+         self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
+         storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = True,
+         automatch_domain: Optional[str] = None,
+     ):
+         """The arguments below are the same as in the Adaptor class, so you can pass them directly; the rest of the Adaptor's arguments
+         are detected and passed automatically from the Fetcher based on the response, for convenience.
+
+         :param huge_tree: Enabled by default, should always be enabled when parsing large HTML documents. This controls
+             the libxml2 feature that forbids parsing certain large documents to protect from possible memory exhaustion.
+         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons.
+         :param auto_match: Globally turn off the auto-match feature in all functions; this argument takes higher
+             priority over all auto-match related arguments/functions in the class.
+         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
+         :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
+             If empty, default values will be used.
+         :param automatch_domain: For cases where you want to auto-match selectors across different websites as if they were on the same website, use this argument to unify them.
+             Otherwise, the domain of the request is used by default.
+         :param debug: Enable debug mode
+         """
+         # Adaptor class parameters
+         # I won't validate Adaptor's class parameters here again, I will leave it to be validated later
+         self.adaptor_arguments = dict(
+             huge_tree=huge_tree,
+             keep_comments=keep_comments,
+             auto_match=auto_match,
+             storage=storage,
+             storage_args=storage_args,
+             debug=debug,
+         )
+         # If the user used fetchers first, then configure the logger from here instead of the `Adaptor` class
+         setup_basic_logging(level='debug' if debug else 'info')
+         if automatch_domain:
+             if type(automatch_domain) is not str:
+                 logging.warning('[Ignored] The argument "automatch_domain" must be of string type')
+             else:
+                 self.adaptor_arguments.update({'automatch_domain': automatch_domain})
+
+
+ def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
+     """This function checks whether the passed engine can be used by a Fetcher-type class.
+
+     :param engine: The engine class itself
+     :return: The engine class again if everything checks out, otherwise an error is raised
+     :raise TypeError: If the engine class doesn't have a `fetch` method, if `fetch` is an attribute rather than a method, or if the `fetch` method doesn't take any arguments
+     """
+     # if isinstance(engine, type):
+     #     raise TypeError("Expected an engine instance, not a class definition of the engine")
+
+     if hasattr(engine, 'fetch'):
+         fetch_function = getattr(engine, "fetch")
+         if callable(fetch_function):
+             if len(inspect.signature(fetch_function).parameters) > 0:
+                 return engine
+             else:
+                 # raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
+                 raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
+         else:
+             # raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
+             raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
+     else:
+         # raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
+         raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
+
+
+ def get_variable_name(var: Any) -> Optional[str]:
+     """Get the name of a variable using global and local scopes.
+     :param var: The variable to find the name for
+     :return: The name of the variable if found, None otherwise
+     """
+     for scope in [globals(), locals()]:
+         for name, value in scope.items():
+             if value is var:
+                 return name
+     return None
+
+
+ def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
+     """Check if a variable matches the specified type constraints.
+     :param variable: The variable to check
+     :param valid_types: List of valid types for the variable
+     :param default_value: Value to return if type check fails
+     :param critical: If True, raises TypeError instead of logging error
+     :param param_name: Optional parameter name for error messages
+     :return: The original variable if valid, default_value if invalid
+     :raise TypeError: If critical=True and type check fails
+     """
+     # Use provided param_name or try to get it automatically
+     var_name = param_name or get_variable_name(variable) or "Unknown"
+
+     # Convert valid_types to a list if None
+     valid_types = valid_types or []
+
+     # Handle None value
+     if variable is None:
+         if type(None) in valid_types:
+             return variable
+         error_msg = f'Argument "{var_name}" cannot be None'
+         if critical:
+             raise TypeError(error_msg)
+         logging.error(f'[Ignored] {error_msg}')
+         return default_value
+
+     # If no valid_types specified and variable has a value, return it
+     if not valid_types:
+         return variable
+
+     # Check if variable type matches any of the valid types
+     if not any(isinstance(variable, t) for t in valid_types):
+         type_names = [t.__name__ for t in valid_types]
+         error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
+         if critical:
+             raise TypeError(error_msg)
+         logging.error(f'[Ignored] {error_msg}')
+         return default_value
+
+     return variable
+
+
+ # Pew Pew
+ def do_nothing(page):
+     # Just works as a filler for `page_action` argument in browser engines
+     return page
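
A small sketch of how `check_type_validity` behaves with the logic above (values are illustrative only):

    # Wrong type: logs '[Ignored] Argument "timeout" must be of type int or float' and falls back to the default
    timeout = check_type_validity('30s', [int, float], default_value=30000, param_name='timeout')
    assert timeout == 30000

    # Valid type: the value is returned unchanged
    timeout = check_type_validity(15.5, [int, float], default_value=30000, param_name='timeout')
    assert timeout == 15.5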
@@ -0,0 +1,81 @@
+ """
+ Functions related to generating headers and fingerprints generally
+ """
+
+ import platform
+
+ from scrapling.core.utils import cache
+ from scrapling.core._types import Union, Dict
+
+ from tldextract import extract
+ from browserforge.headers import HeaderGenerator, Browser
+ from browserforge.fingerprints import FingerprintGenerator, Fingerprint
+
+
+ @cache(None, typed=True)
+ def generate_convincing_referer(url: str) -> str:
+     """Takes the domain from the URL without the subdomain/suffix and makes it look like you were searching Google for this website
+
+     >>> generate_convincing_referer('https://www.somewebsite.com/blah')
+     'https://www.google.com/search?q=somewebsite'
+
+     :param url: The URL you are about to fetch.
+     :return: Google's search URL of the domain name
+     """
+     website_name = extract(url).domain
+     return f'https://www.google.com/search?q={website_name}'
+
+
+ @cache(None, typed=True)
+ def get_os_name() -> Union[str, None]:
+     """Get the current OS name in the same format needed for browserforge
+
+     :return: Current OS name or `None` otherwise
+     """
+     #
+     os_name = platform.system()
+     return {
+         'Linux': 'linux',
+         'Darwin': 'macos',
+         'Windows': 'windows',
+         # For the future? because why not
+         'iOS': 'ios',
+     }.get(os_name)
+
+
+ def generate_suitable_fingerprint() -> Fingerprint:
+     """Generates a browserforge fingerprint that matches the current OS, a desktop device, and Chrome version 128 at least.
+
+     This function was originally created to test Browserforge's injector.
+     :return: `Fingerprint` object
+     """
+     return FingerprintGenerator(
+         browser=[Browser(name='chrome', min_version=128)],
+         os=get_os_name(),  # None is ignored
+         device='desktop'
+     ).generate()
+
+
+ def generate_headers(browser_mode: bool = False) -> Dict:
+     """Generate real browser-like headers using browserforge's generator
+
+     :param browser_mode: If enabled, the headers created are used for playwright, so they have to match everything
+     :return: A dictionary of the generated headers
+     """
+     if browser_mode:
+         # In this mode we don't care about anything other than matching the OS and the browser type with the browser we are using
+         # so we don't raise any inconsistency red flags while websites fingerprint us
+         os_name = get_os_name()
+         return HeaderGenerator(
+             browser=[Browser(name='chrome', min_version=128)],
+             os=os_name,  # None is ignored
+             device='desktop'
+         ).generate()
+     else:
+         # Here it's used for normal requests that aren't done through browsers, so we can take it lightly
+         browsers = [
+             Browser(name='chrome', min_version=120),
+             Browser(name='firefox', min_version=120),
+             Browser(name='edge', min_version=120),
+         ]
+         return HeaderGenerator(browser=browsers, device='desktop').generate()
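
A short sketch of the two public helpers above in use (the referer value follows from the doctest; generated headers vary per run):

    generate_convincing_referer('https://blog.somewebsite.co.uk/post/1')
    # -> 'https://www.google.com/search?q=somewebsite'   (tldextract drops the subdomain and suffix)

    headers = generate_headers(browser_mode=False)
    headers.get('User-Agent')   # a realistic desktop Chrome/Firefox/Edge user agent string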