scrapling 0.2.99__py3-none-any.whl → 0.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +18 -31
- scrapling/cli.py +818 -20
- scrapling/core/_html_utils.py +348 -0
- scrapling/core/_types.py +34 -17
- scrapling/core/ai.py +611 -0
- scrapling/core/custom_types.py +183 -100
- scrapling/core/mixins.py +27 -19
- scrapling/core/shell.py +647 -0
- scrapling/core/{storage_adaptors.py → storage.py} +41 -33
- scrapling/core/translator.py +20 -26
- scrapling/core/utils.py +49 -54
- scrapling/engines/__init__.py +15 -6
- scrapling/engines/_browsers/__init__.py +2 -0
- scrapling/engines/_browsers/_camoufox.py +759 -0
- scrapling/engines/_browsers/_config_tools.py +130 -0
- scrapling/engines/_browsers/_controllers.py +644 -0
- scrapling/engines/_browsers/_page.py +93 -0
- scrapling/engines/_browsers/_validators.py +170 -0
- scrapling/engines/constants.py +101 -88
- scrapling/engines/static.py +667 -110
- scrapling/engines/toolbelt/__init__.py +20 -6
- scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js +2 -1
- scrapling/engines/toolbelt/convertor.py +254 -0
- scrapling/engines/toolbelt/custom.py +158 -175
- scrapling/engines/toolbelt/fingerprints.py +32 -46
- scrapling/engines/toolbelt/navigation.py +68 -39
- scrapling/fetchers.py +239 -333
- scrapling/parser.py +781 -449
- scrapling-0.3.1.dist-info/METADATA +411 -0
- scrapling-0.3.1.dist-info/RECORD +41 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/WHEEL +1 -1
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/top_level.txt +0 -1
- scrapling/defaults.py +0 -25
- scrapling/engines/camo.py +0 -339
- scrapling/engines/pw.py +0 -465
- scrapling/engines/toolbelt/bypasses/pdf_viewer.js +0 -5
- scrapling-0.2.99.dist-info/METADATA +0 -290
- scrapling-0.2.99.dist-info/RECORD +0 -49
- tests/__init__.py +0 -1
- tests/fetchers/__init__.py +0 -1
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +0 -97
- tests/fetchers/async/test_httpx.py +0 -85
- tests/fetchers/async/test_playwright.py +0 -101
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +0 -70
- tests/fetchers/sync/test_httpx.py +0 -84
- tests/fetchers/sync/test_playwright.py +0 -89
- tests/fetchers/test_utils.py +0 -97
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +0 -111
- tests/parser/test_general.py +0 -330
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/entry_points.txt +0 -0
- {scrapling-0.2.99.dist-info → scrapling-0.3.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,19 +1,29 @@
|
|
1
1
|
"""
|
2
2
|
Functions related to custom types or type checking
|
3
3
|
"""
|
4
|
-
|
4
|
+
|
5
5
|
from email.message import Message
|
6
6
|
|
7
|
-
from scrapling.core._types import (
|
8
|
-
|
7
|
+
from scrapling.core._types import (
|
8
|
+
Any,
|
9
|
+
Dict,
|
10
|
+
List,
|
11
|
+
Optional,
|
12
|
+
Tuple,
|
13
|
+
)
|
9
14
|
from scrapling.core.custom_types import MappingProxyType
|
10
15
|
from scrapling.core.utils import log, lru_cache
|
11
|
-
from scrapling.parser import
|
16
|
+
from scrapling.parser import Selector, SQLiteStorageSystem
|
12
17
|
|
13
18
|
|
14
19
|
class ResponseEncoding:
|
15
20
|
__DEFAULT_ENCODING = "utf-8"
|
16
|
-
__ISO_8859_1_CONTENT_TYPES = {
|
21
|
+
__ISO_8859_1_CONTENT_TYPES = {
|
22
|
+
"text/plain",
|
23
|
+
"text/html",
|
24
|
+
"text/css",
|
25
|
+
"text/javascript",
|
26
|
+
}
|
17
27
|
|
18
28
|
@classmethod
|
19
29
|
@lru_cache(maxsize=128)
|
@@ -27,19 +37,21 @@ class ResponseEncoding:
|
|
27
37
|
"""
|
28
38
|
# Create a Message object and set the Content-Type header then get the content type and parameters
|
29
39
|
msg = Message()
|
30
|
-
msg[
|
40
|
+
msg["content-type"] = header_value
|
31
41
|
|
32
42
|
content_type = msg.get_content_type()
|
33
43
|
params = dict(msg.get_params(failobj=[]))
|
34
44
|
|
35
45
|
# Remove the content-type from params if present somehow
|
36
|
-
params.pop(
|
46
|
+
params.pop("content-type", None)
|
37
47
|
|
38
48
|
return content_type, params
|
39
49
|
|
40
50
|
@classmethod
|
41
51
|
@lru_cache(maxsize=128)
|
42
|
-
def get_value(
|
52
|
+
def get_value(
|
53
|
+
cls, content_type: Optional[str], text: Optional[str] = "test"
|
54
|
+
) -> str:
|
43
55
|
"""Determine the appropriate character encoding from a content-type header.
|
44
56
|
|
45
57
|
The encoding is determined by these rules in order:
|
@@ -72,7 +84,9 @@ class ResponseEncoding:
|
|
72
84
|
encoding = cls.__DEFAULT_ENCODING
|
73
85
|
|
74
86
|
if encoding:
|
75
|
-
_ = text.encode(
|
87
|
+
_ = text.encode(
|
88
|
+
encoding
|
89
|
+
) # Validate encoding and validate it can encode the given text
|
76
90
|
return encoding
|
77
91
|
|
78
92
|
return cls.__DEFAULT_ENCODING
|
@@ -81,48 +95,74 @@ class ResponseEncoding:
|
|
81
95
|
return cls.__DEFAULT_ENCODING
|
82
96
|
|
83
97
|
|
84
|
-
class Response(
|
98
|
+
class Response(Selector):
|
85
99
|
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
86
100
|
|
87
|
-
def __init__(
|
88
|
-
|
89
|
-
|
101
|
+
def __init__(
|
102
|
+
self,
|
103
|
+
url: str,
|
104
|
+
content: str | bytes,
|
105
|
+
status: int,
|
106
|
+
reason: str,
|
107
|
+
cookies: Tuple[Dict[str, str], ...] | Dict[str, str],
|
108
|
+
headers: Dict,
|
109
|
+
request_headers: Dict,
|
110
|
+
encoding: str = "utf-8",
|
111
|
+
method: str = "GET",
|
112
|
+
history: List = None,
|
113
|
+
**selector_config: Dict,
|
114
|
+
):
|
115
|
+
adaptive_domain = selector_config.pop("adaptive_domain", None)
|
90
116
|
self.status = status
|
91
117
|
self.reason = reason
|
92
118
|
self.cookies = cookies
|
93
119
|
self.headers = headers
|
94
120
|
self.request_headers = request_headers
|
95
121
|
self.history = history or []
|
96
|
-
encoding = ResponseEncoding.get_value(
|
97
|
-
|
98
|
-
|
99
|
-
|
122
|
+
encoding = ResponseEncoding.get_value(
|
123
|
+
encoding, content.decode("utf-8") if isinstance(content, bytes) else content
|
124
|
+
)
|
125
|
+
super().__init__(
|
126
|
+
content=content,
|
127
|
+
url=adaptive_domain or url,
|
128
|
+
encoding=encoding,
|
129
|
+
**selector_config,
|
130
|
+
)
|
100
131
|
# For easier debugging while working from a Python shell
|
101
|
-
log.info(
|
102
|
-
|
103
|
-
|
104
|
-
# return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
|
132
|
+
log.info(
|
133
|
+
f"Fetched ({status}) <{method} {url}> (referer: {request_headers.get('referer')})"
|
134
|
+
)
|
105
135
|
|
106
136
|
|
107
137
|
class BaseFetcher:
|
108
138
|
__slots__ = ()
|
109
139
|
huge_tree: bool = True
|
110
|
-
|
140
|
+
adaptive: Optional[bool] = False
|
111
141
|
storage: Any = SQLiteStorageSystem
|
112
142
|
keep_cdata: Optional[bool] = False
|
113
143
|
storage_args: Optional[Dict] = None
|
114
144
|
keep_comments: Optional[bool] = False
|
115
|
-
|
116
|
-
parser_keywords: Tuple = (
|
145
|
+
adaptive_domain: Optional[str] = None
|
146
|
+
parser_keywords: Tuple = (
|
147
|
+
"huge_tree",
|
148
|
+
"adaptive",
|
149
|
+
"storage",
|
150
|
+
"keep_cdata",
|
151
|
+
"storage_args",
|
152
|
+
"keep_comments",
|
153
|
+
"adaptive_domain",
|
154
|
+
) # Left open for the user
|
117
155
|
|
118
156
|
def __init__(self, *args, **kwargs):
|
119
157
|
# For backward-compatibility before 0.2.99
|
120
|
-
args_str = ", ".join(args) or
|
121
|
-
kwargs_str = ", ".join(f
|
158
|
+
args_str = ", ".join(args) or ""
|
159
|
+
kwargs_str = ", ".join(f"{k}={v}" for k, v in kwargs.items()) or ""
|
122
160
|
if args_str:
|
123
|
-
args_str +=
|
161
|
+
args_str += ", "
|
124
162
|
|
125
|
-
log.warning(
|
163
|
+
log.warning(
|
164
|
+
f"This logic is deprecated now, and have no effect; It will be removed with v0.3. Use `{self.__class__.__name__}.configure({args_str}{kwargs_str})` instead before fetching"
|
165
|
+
)
|
126
166
|
pass
|
127
167
|
|
128
168
|
@classmethod
|
@@ -131,17 +171,17 @@ class BaseFetcher:
|
|
131
171
|
huge_tree=cls.huge_tree,
|
132
172
|
keep_comments=cls.keep_comments,
|
133
173
|
keep_cdata=cls.keep_cdata,
|
134
|
-
|
174
|
+
adaptive=cls.adaptive,
|
135
175
|
storage=cls.storage,
|
136
176
|
storage_args=cls.storage_args,
|
137
|
-
|
177
|
+
adaptive_domain=cls.adaptive_domain,
|
138
178
|
)
|
139
179
|
|
140
180
|
@classmethod
|
141
181
|
def configure(cls, **kwargs):
|
142
182
|
"""Set multiple arguments for the parser at once globally
|
143
183
|
|
144
|
-
:param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata,
|
184
|
+
:param kwargs: The keywords can be any arguments of the following: huge_tree, keep_comments, keep_cdata, adaptive, storage, storage_args, adaptive_domain
|
145
185
|
"""
|
146
186
|
for key, value in kwargs.items():
|
147
187
|
key = key.strip().lower()
|
@@ -150,30 +190,38 @@ class BaseFetcher:
|
|
150
190
|
setattr(cls, key, value)
|
151
191
|
else:
|
152
192
|
# Yup, no fun allowed LOL
|
153
|
-
raise AttributeError(
|
193
|
+
raise AttributeError(
|
194
|
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
195
|
+
)
|
154
196
|
else:
|
155
|
-
raise ValueError(
|
197
|
+
raise ValueError(
|
198
|
+
f'Unknown parser argument: "{key}"; maybe you meant {cls.parser_keywords}?'
|
199
|
+
)
|
156
200
|
|
157
201
|
if not kwargs:
|
158
|
-
raise AttributeError(
|
202
|
+
raise AttributeError(
|
203
|
+
f"You must pass a keyword to configure, current keywords: {cls.parser_keywords}?"
|
204
|
+
)
|
159
205
|
|
160
206
|
@classmethod
|
161
207
|
def _generate_parser_arguments(cls) -> Dict:
|
162
|
-
#
|
163
|
-
# I won't validate
|
208
|
+
# Selector class parameters
|
209
|
+
# I won't validate Selector's class parameters here again, I will leave it to be validated later
|
164
210
|
parser_arguments = dict(
|
165
211
|
huge_tree=cls.huge_tree,
|
166
212
|
keep_comments=cls.keep_comments,
|
167
213
|
keep_cdata=cls.keep_cdata,
|
168
|
-
|
214
|
+
adaptive=cls.adaptive,
|
169
215
|
storage=cls.storage,
|
170
|
-
storage_args=cls.storage_args
|
216
|
+
storage_args=cls.storage_args,
|
171
217
|
)
|
172
|
-
if cls.
|
173
|
-
if
|
174
|
-
log.warning(
|
218
|
+
if cls.adaptive_domain:
|
219
|
+
if not isinstance(cls.adaptive_domain, str):
|
220
|
+
log.warning(
|
221
|
+
'[Ignored] The argument "adaptive_domain" must be of string type'
|
222
|
+
)
|
175
223
|
else:
|
176
|
-
parser_arguments.update({
|
224
|
+
parser_arguments.update({"adaptive_domain": cls.adaptive_domain})
|
177
225
|
|
178
226
|
return parser_arguments
|
179
227
|
|
@@ -181,72 +229,75 @@ class BaseFetcher:
|
|
181
229
|
class StatusText:
|
182
230
|
"""A class that gets the status text of response status code.
|
183
231
|
|
184
|
-
|
232
|
+
Reference: https://developer.mozilla.org/en-US/docs/Web/HTTP/Status
|
185
233
|
"""
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
|
206
|
-
|
207
|
-
|
208
|
-
|
209
|
-
|
210
|
-
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
249
|
-
|
234
|
+
|
235
|
+
_phrases = MappingProxyType(
|
236
|
+
{
|
237
|
+
100: "Continue",
|
238
|
+
101: "Switching Protocols",
|
239
|
+
102: "Processing",
|
240
|
+
103: "Early Hints",
|
241
|
+
200: "OK",
|
242
|
+
201: "Created",
|
243
|
+
202: "Accepted",
|
244
|
+
203: "Non-Authoritative Information",
|
245
|
+
204: "No Content",
|
246
|
+
205: "Reset Content",
|
247
|
+
206: "Partial Content",
|
248
|
+
207: "Multi-Status",
|
249
|
+
208: "Already Reported",
|
250
|
+
226: "IM Used",
|
251
|
+
300: "Multiple Choices",
|
252
|
+
301: "Moved Permanently",
|
253
|
+
302: "Found",
|
254
|
+
303: "See Other",
|
255
|
+
304: "Not Modified",
|
256
|
+
305: "Use Proxy",
|
257
|
+
307: "Temporary Redirect",
|
258
|
+
308: "Permanent Redirect",
|
259
|
+
400: "Bad Request",
|
260
|
+
401: "Unauthorized",
|
261
|
+
402: "Payment Required",
|
262
|
+
403: "Forbidden",
|
263
|
+
404: "Not Found",
|
264
|
+
405: "Method Not Allowed",
|
265
|
+
406: "Not Acceptable",
|
266
|
+
407: "Proxy Authentication Required",
|
267
|
+
408: "Request Timeout",
|
268
|
+
409: "Conflict",
|
269
|
+
410: "Gone",
|
270
|
+
411: "Length Required",
|
271
|
+
412: "Precondition Failed",
|
272
|
+
413: "Payload Too Large",
|
273
|
+
414: "URI Too Long",
|
274
|
+
415: "Unsupported Media Type",
|
275
|
+
416: "Range Not Satisfiable",
|
276
|
+
417: "Expectation Failed",
|
277
|
+
418: "I'm a teapot",
|
278
|
+
421: "Misdirected Request",
|
279
|
+
422: "Unprocessable Entity",
|
280
|
+
423: "Locked",
|
281
|
+
424: "Failed Dependency",
|
282
|
+
425: "Too Early",
|
283
|
+
426: "Upgrade Required",
|
284
|
+
428: "Precondition Required",
|
285
|
+
429: "Too Many Requests",
|
286
|
+
431: "Request Header Fields Too Large",
|
287
|
+
451: "Unavailable For Legal Reasons",
|
288
|
+
500: "Internal Server Error",
|
289
|
+
501: "Not Implemented",
|
290
|
+
502: "Bad Gateway",
|
291
|
+
503: "Service Unavailable",
|
292
|
+
504: "Gateway Timeout",
|
293
|
+
505: "HTTP Version Not Supported",
|
294
|
+
506: "Variant Also Negotiates",
|
295
|
+
507: "Insufficient Storage",
|
296
|
+
508: "Loop Detected",
|
297
|
+
510: "Not Extended",
|
298
|
+
511: "Network Authentication Required",
|
299
|
+
}
|
300
|
+
)
|
250
301
|
|
251
302
|
@classmethod
|
252
303
|
@lru_cache(maxsize=128)
|
@@ -255,32 +306,6 @@ class StatusText:
|
|
255
306
|
return cls._phrases.get(status_code, "Unknown Status Code")
|
256
307
|
|
257
308
|
|
258
|
-
def check_if_engine_usable(engine: Callable) -> Union[Callable, None]:
|
259
|
-
"""This function check if the passed engine can be used by a Fetcher-type class or not.
|
260
|
-
|
261
|
-
:param engine: The engine class itself
|
262
|
-
:return: The engine class again if all checks out, otherwise raises error
|
263
|
-
:raise TypeError: If engine class don't have fetch method, If engine class have fetch attribute not method, or If engine class have fetch function but it doesn't take arguments
|
264
|
-
"""
|
265
|
-
# if isinstance(engine, type):
|
266
|
-
# raise TypeError("Expected an engine instance, not a class definition of the engine")
|
267
|
-
|
268
|
-
if hasattr(engine, 'fetch'):
|
269
|
-
fetch_function = getattr(engine, "fetch")
|
270
|
-
if callable(fetch_function):
|
271
|
-
if len(inspect.signature(fetch_function).parameters) > 0:
|
272
|
-
return engine
|
273
|
-
else:
|
274
|
-
# raise TypeError("Engine class instance must have a callable method 'fetch' with the first argument used for the url.")
|
275
|
-
raise TypeError("Engine class must have a callable method 'fetch' with the first argument used for the url.")
|
276
|
-
else:
|
277
|
-
# raise TypeError("Invalid engine instance! Engine class must have a callable method 'fetch'")
|
278
|
-
raise TypeError("Invalid engine class! Engine class must have a callable method 'fetch'")
|
279
|
-
else:
|
280
|
-
# raise TypeError("Invalid engine instance! Engine class must have the method 'fetch'")
|
281
|
-
raise TypeError("Invalid engine class! Engine class must have the method 'fetch'")
|
282
|
-
|
283
|
-
|
284
309
|
def get_variable_name(var: Any) -> Optional[str]:
|
285
310
|
"""Get the name of a variable using global and local scopes.
|
286
311
|
:param var: The variable to find the name for
|
@@ -291,45 +316,3 @@ def get_variable_name(var: Any) -> Optional[str]:
|
|
291
316
|
if value is var:
|
292
317
|
return name
|
293
318
|
return None
|
294
|
-
|
295
|
-
|
296
|
-
def check_type_validity(variable: Any, valid_types: Union[List[Type], None], default_value: Any = None, critical: bool = False, param_name: Optional[str] = None) -> Any:
|
297
|
-
"""Check if a variable matches the specified type constraints.
|
298
|
-
:param variable: The variable to check
|
299
|
-
:param valid_types: List of valid types for the variable
|
300
|
-
:param default_value: Value to return if type check fails
|
301
|
-
:param critical: If True, raises TypeError instead of logging error
|
302
|
-
:param param_name: Optional parameter name for error messages
|
303
|
-
:return: The original variable if valid, default_value if invalid
|
304
|
-
:raise TypeError: If critical=True and type check fails
|
305
|
-
"""
|
306
|
-
# Use provided param_name or try to get it automatically
|
307
|
-
var_name = param_name or get_variable_name(variable) or "Unknown"
|
308
|
-
|
309
|
-
# Convert valid_types to a list if None
|
310
|
-
valid_types = valid_types or []
|
311
|
-
|
312
|
-
# Handle None value
|
313
|
-
if variable is None:
|
314
|
-
if type(None) in valid_types:
|
315
|
-
return variable
|
316
|
-
error_msg = f'Argument "{var_name}" cannot be None'
|
317
|
-
if critical:
|
318
|
-
raise TypeError(error_msg)
|
319
|
-
log.error(f'[Ignored] {error_msg}')
|
320
|
-
return default_value
|
321
|
-
|
322
|
-
# If no valid_types specified and variable has a value, return it
|
323
|
-
if not valid_types:
|
324
|
-
return variable
|
325
|
-
|
326
|
-
# Check if variable type matches any of the valid types
|
327
|
-
if not any(isinstance(variable, t) for t in valid_types):
|
328
|
-
type_names = [t.__name__ for t in valid_types]
|
329
|
-
error_msg = f'Argument "{var_name}" must be of type {" or ".join(type_names)}'
|
330
|
-
if critical:
|
331
|
-
raise TypeError(error_msg)
|
332
|
-
log.error(f'[Ignored] {error_msg}')
|
333
|
-
return default_value
|
334
|
-
|
335
|
-
return variable
|
@@ -2,19 +2,20 @@
|
|
2
2
|
Functions related to generating headers and fingerprints generally
|
3
3
|
"""
|
4
4
|
|
5
|
-
import
|
5
|
+
from platform import system as platform_system
|
6
6
|
|
7
|
-
from browserforge.fingerprints import Fingerprint, FingerprintGenerator
|
8
|
-
from browserforge.headers import Browser, HeaderGenerator
|
9
7
|
from tldextract import extract
|
8
|
+
from browserforge.headers import Browser, HeaderGenerator
|
10
9
|
|
11
|
-
from scrapling.core._types import Dict,
|
10
|
+
from scrapling.core._types import Dict, Optional
|
12
11
|
from scrapling.core.utils import lru_cache
|
13
12
|
|
13
|
+
__OS_NAME__ = platform_system()
|
14
|
+
|
14
15
|
|
15
16
|
@lru_cache(10, typed=True)
|
16
17
|
def generate_convincing_referer(url: str) -> str:
|
17
|
-
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching
|
18
|
+
"""Takes the domain from the URL without the subdomain/suffix and make it look like you were searching Google for this website
|
18
19
|
|
19
20
|
>>> generate_convincing_referer('https://www.somewebsite.com/blah')
|
20
21
|
'https://www.google.com/search?q=somewebsite'
|
@@ -23,59 +24,44 @@ def generate_convincing_referer(url: str) -> str:
|
|
23
24
|
:return: Google's search URL of the domain name
|
24
25
|
"""
|
25
26
|
website_name = extract(url).domain
|
26
|
-
return f
|
27
|
+
return f"https://www.google.com/search?q={website_name}"
|
27
28
|
|
28
29
|
|
29
30
|
@lru_cache(1, typed=True)
|
30
|
-
def get_os_name() ->
|
31
|
+
def get_os_name() -> Optional[str]:
|
31
32
|
"""Get the current OS name in the same format needed for browserforge
|
32
33
|
|
33
34
|
:return: Current OS name or `None` otherwise
|
34
35
|
"""
|
35
|
-
#
|
36
|
-
os_name = platform.system()
|
37
36
|
return {
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
# For the future? because why not
|
42
|
-
|
43
|
-
}.get(
|
44
|
-
|
45
|
-
|
46
|
-
def generate_suitable_fingerprint() -> Fingerprint:
|
47
|
-
"""Generates a browserforge's fingerprint that matches current OS, desktop device, and Chrome with version 128 at least.
|
48
|
-
|
49
|
-
This function was originally created to test Browserforge's injector.
|
50
|
-
:return: `Fingerprint` object
|
51
|
-
"""
|
52
|
-
return FingerprintGenerator(
|
53
|
-
browser=[Browser(name='chrome', min_version=128)],
|
54
|
-
os=get_os_name(), # None is ignored
|
55
|
-
device='desktop'
|
56
|
-
).generate()
|
37
|
+
"Linux": "linux",
|
38
|
+
"Darwin": "macos",
|
39
|
+
"Windows": "windows",
|
40
|
+
# For the future? because why not?
|
41
|
+
"iOS": "ios",
|
42
|
+
}.get(__OS_NAME__)
|
57
43
|
|
58
44
|
|
59
45
|
def generate_headers(browser_mode: bool = False) -> Dict:
|
60
46
|
"""Generate real browser-like headers using browserforge's generator
|
61
47
|
|
62
|
-
:param browser_mode: If enabled, the headers created are used for playwright so it
|
48
|
+
:param browser_mode: If enabled, the headers created are used for playwright, so it has to match everything
|
63
49
|
:return: A dictionary of the generated headers
|
64
50
|
"""
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
51
|
+
# In the browser mode, we don't care about anything other than matching the OS and the browser type with the browser we are using,
|
52
|
+
# So we don't raise any inconsistency red flags while websites fingerprinting us
|
53
|
+
os_name = get_os_name()
|
54
|
+
browsers = [Browser(name="chrome", min_version=130)]
|
55
|
+
if not browser_mode:
|
56
|
+
os_name = ("windows", "macos", "linux")
|
57
|
+
browsers.extend(
|
58
|
+
[
|
59
|
+
Browser(name="firefox", min_version=130),
|
60
|
+
Browser(name="edge", min_version=130),
|
61
|
+
]
|
62
|
+
)
|
63
|
+
|
64
|
+
return HeaderGenerator(browser=browsers, os=os_name, device="desktop").generate()
|
65
|
+
|
66
|
+
|
67
|
+
__default_useragent__ = generate_headers(browser_mode=False).get("User-Agent")
|