scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/__init__.py
CHANGED
@@ -1,10 +1,11 @@
 # Declare top-level shortcuts
+from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
 from scrapling.parser import Adaptor, Adaptors
-from scrapling.custom_types import TextHandler, AttributesHandler
+from scrapling.core.custom_types import TextHandler, AttributesHandler

 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.1.2"
+__version__ = "0.2.1"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"


-__all__ = ['Adaptor', '
+__all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
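With these shortcuts in place, the fetchers import straight from the package root. A minimal sketch (the URL is a placeholder, and it assumes the requests-style get() that the bundled tests/fetchers/test_httpx.py exercises):

    from scrapling import Fetcher

    page = Fetcher().get('https://example.com')
    print(page.status)  # the returned object is an Adaptor with response metadata attached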
scrapling/core/__init__.py
File without changes
scrapling/core/_types.py
ADDED
@@ -0,0 +1,25 @@
+"""
+Type definitions for type checking purposes.
+"""
+
+from typing import (
+    Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
+)
+
+try:
+    from typing import Protocol
+except ImportError:
+    # Added in Python 3.8
+    Protocol = object
+
+try:
+    from typing import SupportsIndex
+except ImportError:
+    # 'SupportsIndex' got added in Python 3.8
+    SupportsIndex = None
+
+if TYPE_CHECKING:
+    # typing.Self requires Python 3.11
+    from typing_extensions import Self
+else:
+    Self = object
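The renamed core modules below all switch to pulling their typing names from this one module (e.g. `from scrapling.core._types import Dict, List, Union`), so the version fallbacks live in a single place. A minimal sketch of the pattern, with a hypothetical helper name:

    from scrapling.core._types import Dict, Optional

    def normalize_headers(headers: Optional[Dict[str, str]]) -> Dict[str, str]:
        # hypothetical helper, shown only to illustrate the centralized typing imports
        return {k.lower(): v for k, v in (headers or {}).items()}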
scrapling/{custom_types.py → core/custom_types.py}
RENAMED
@@ -1,9 +1,9 @@
 import re
 from types import MappingProxyType
 from collections.abc import Mapping
-from typing import Dict, List, Union, Pattern

-from scrapling.utils import _is_iterable, flatten
+from scrapling.core.utils import _is_iterable, flatten
+from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex

 from orjson import loads, dumps
 from w3lib.html import replace_entities as _replace_entities
@@ -69,7 +69,7 @@ class TextHandler(str):
         return [TextHandler(_replace_entities(s)) for s in results]

     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                 clean_match: bool = False, case_sensitive: bool = False,
+                 clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
         """Apply the given regex to text and return the first match if found, otherwise return the default value.

         :param regex: Can be either a compiled regular expression or a string.
@@ -83,6 +83,51 @@ class TextHandler(str):
         return result[0] if result else default


+class TextHandlers(List[TextHandler]):
+    """
+    The :class:`TextHandlers` class is a subclass of the builtin ``List`` class, which provides a few additional methods.
+    """
+    __slots__ = ()
+
+    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers[TextHandler]"]:
+        lst = super().__getitem__(pos)
+        if isinstance(pos, slice):
+            return self.__class__(lst)
+        else:
+            return lst
+
+    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
+           case_sensitive: bool = False) -> 'List[str]':
+        """Call the ``.re()`` method for each element in this list and return
+        their results flattened as TextHandlers.
+
+        :param regex: Can be either a compiled regular expression or a string.
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+        """
+        results = [
+            n.re(regex, replace_entities, clean_match, case_sensitive) for n in self
+        ]
+        return flatten(results)
+
+    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                 clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
+        """Call the ``.re_first()`` method for each element in this list and return
+        the first result or the default value otherwise.
+
+        :param regex: Can be either a compiled regular expression or a string.
+        :param default: The default value to be returned if there is no match
+        :param replace_entities: if enabled character entity references are replaced by their corresponding character
+        :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+        :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+        """
+        for n in self:
+            for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+                return result
+        return default
+
+
 class AttributesHandler(Mapping):
     """A read-only mapping to use instead of the standard dictionary for the speed boost but
     at the same time I use it to add more functionalities.
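The new TextHandlers container applies a regex across every string it holds and flattens the results. A standalone sketch (building the list by hand, where normally it would come from selector results):

    from scrapling.core.custom_types import TextHandler, TextHandlers

    prices = TextHandlers([TextHandler('Price: $10.99'), TextHandler('Price: $5.49')])
    print(prices.re(r'\$(\d+\.\d+)'))        # all matches from all elements, flattened
    print(prices.re_first(r'\$(\d+\.\d+)'))  # first match only, or the default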
scrapling/{mixins.py → core/mixins.py}
RENAMED
@@ -4,7 +4,7 @@ class SelectorsGeneration:
     Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
     Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""

-    def __general_selection(self, selection: str = 'css') -> str:
+    def __general_selection(self, selection: str = 'css', full_path=False) -> str:
         """Generate a selector for the current element.
         :return: A string of the generated selector.
         """
@@ -20,10 +20,11 @@ class SelectorsGeneration:
                     else f"[@id='{target.attrib['id']}']"
                 )
                 selectorPath.append(part)
-                return (
-                    " > ".join(reversed(selectorPath)) if css
-                    else '//*' + "/".join(reversed(selectorPath))
-                )
+                if not full_path:
+                    return (
+                        " > ".join(reversed(selectorPath)) if css
+                        else '//*' + "/".join(reversed(selectorPath))
+                    )
             else:
                 part = f'{target.tag}'
                 # We won't use classes anymore because I some websites share exact classes between elements
@@ -60,15 +61,29 @@ class SelectorsGeneration:
         )

     @property
-    def
+    def generate_css_selector(self) -> str:
         """Generate a CSS selector for the current element
         :return: A string of the generated selector.
         """
         return self.__general_selection()

     @property
-    def
+    def generate_full_css_selector(self) -> str:
+        """Generate a complete CSS selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self.__general_selection(full_path=True)
+
+    @property
+    def generate_xpath_selector(self) -> str:
         """Generate a XPath selector for the current element
         :return: A string of the generated selector.
         """
         return self.__general_selection('xpath')
+
+    @property
+    def generate_full_xpath_selector(self) -> str:
+        """Generate a complete XPath selector for the current element
+        :return: A string of the generated selector.
+        """
+        return self.__general_selection('xpath', full_path=True)
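The practical difference is that the existing properties stop climbing once an element with an id is found, while the new full variants keep walking to the root. A sketch under the assumption that Adaptor accepts raw HTML as its first argument and exposes css_first, as elsewhere in the library (outputs are illustrative):

    from scrapling import Adaptor

    page = Adaptor('<div id="box"><p><span>hi</span></p></div>')
    element = page.css_first('span')
    print(element.generate_css_selector)       # stops at the nearest id, e.g. '#box > p > span'
    print(element.generate_full_css_selector)  # walks all the way up to the root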
scrapling/{storage_adaptors.py → core/storage_adaptors.py}
RENAMED
@@ -4,9 +4,9 @@ import logging
 import threading
 from hashlib import sha256
 from abc import ABC, abstractmethod
-from typing import Dict, Optional, Union

-from scrapling.
+from scrapling.core._types import Dict, Optional, Union
+from scrapling.core.utils import _StorageTools, cache

 from lxml import html
 from tldextract import extract as tld
scrapling/{translator.py → core/translator.py}
RENAMED
@@ -9,24 +9,14 @@ which will be important in future releases but most importantly...
 import re

 from w3lib.html import HTML5_WHITESPACE
-from
-try:
-    from typing import Protocol
-except ImportError:
-    # Added in Python 3.8
-    Protocol = object
-
-from scrapling.utils import cache
+from scrapling.core.utils import cache
+from scrapling.core._types import Any, Optional, Protocol, Self

 from cssselect.xpath import ExpressionError
 from cssselect.xpath import XPathExpr as OriginalXPathExpr
 from cssselect import HTMLTranslator as OriginalHTMLTranslator
 from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement

-if TYPE_CHECKING:
-    # typing.Self requires Python 3.11
-    from typing_extensions import Self
-

 regex = f"[{HTML5_WHITESPACE}]+"
 replace_html5_whitespaces = re.compile(regex).sub
scrapling/{utils.py → core/utils.py}
RENAMED
@@ -1,14 +1,14 @@
 import re
-import os
 import logging
 from itertools import chain
-from logging import handlers
 # Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
 from functools import lru_cache as cache  # functools.cache is available on Python 3.9+ only so let's keep lru_cache

-from
+from scrapling.core._types import Dict, Iterable, Any, Union

+import orjson
 from lxml import html
+
 html_forbidden = {html.HtmlComment, }
 logging.basicConfig(
     level=logging.ERROR,
@@ -19,6 +19,17 @@ logging.basicConfig(
 )


+def is_jsonable(content: Union[bytes, str]) -> bool:
+    if type(content) is bytes:
+        content = content.decode()
+
+    try:
+        _ = orjson.loads(content)
+        return True
+    except orjson.JSONDecodeError:
+        return False
+
+
 @cache(None, typed=True)
 def setup_basic_logging(level: str = 'debug'):
     levels = {
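The new is_jsonable helper simply round-trips the payload through orjson. Quick checks of its behavior:

    from scrapling.core.utils import is_jsonable

    print(is_jsonable('{"ok": true}'))   # True
    print(is_jsonable(b'[1, 2, 3]'))     # True - bytes are decoded first
    print(is_jsonable('<html></html>'))  # False - not valid JSON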
@@ -45,64 +56,6 @@ def _is_iterable(s: Any):
     return isinstance(s, (list, tuple,))


-@cache(None, typed=True)
-class _Logger(object):
-    # I will leave this class here for now in case I decide I want to come back to use it :)
-    __slots__ = ('console_logger', 'logger_file_path',)
-    levels = {
-        'debug': logging.DEBUG,
-        'info': logging.INFO,
-        'warning': logging.WARNING,
-        'error': logging.ERROR,
-        'critical': logging.CRITICAL
-    }
-
-    def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
-        os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
-        format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
-
-        # on-screen output
-        lvl = self.levels[level.lower()]
-        self.console_logger = logging.getLogger('Scrapling')
-        self.console_logger.setLevel(lvl)
-        console_handler = logging.StreamHandler()
-        console_handler.setLevel(lvl)
-        console_handler.setFormatter(format_str)
-        self.console_logger.addHandler(console_handler)
-
-        if lvl == logging.DEBUG:
-            filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
-            self.logger_file_path = filename
-            # Automatically generates the logging file at specified intervals
-            file_handler = handlers.TimedRotatingFileHandler(
-                # If more than (backcount+1) existed, oldest logs will be deleted
-                filename=filename, when=when, backupCount=backcount, encoding='utf-8'
-            )
-            file_handler.setLevel(lvl)
-            file_handler.setFormatter(format_str)
-            # This for the logger when it appends the date to the new log
-            file_handler.namer = lambda name: name.replace(".log", "") + ".log"
-            self.console_logger.addHandler(file_handler)
-            self.debug(f'Debug log path: {self.logger_file_path}')
-        else:
-            self.logger_file_path = None
-
-    def debug(self, message: str) -> None:
-        self.console_logger.debug(message)
-
-    def info(self, message: str) -> None:
-        self.console_logger.info(message)
-
-    def warning(self, message: str) -> None:
-        self.console_logger.warning(message)
-
-    def error(self, message: str) -> None:
-        self.console_logger.error(message)
-
-    def critical(self, message: str) -> None:
-        self.console_logger.critical(message)
-
-
 class _StorageTools:
     @staticmethod
     def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
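With the unused _Logger class removed, log configuration goes through the setup_basic_logging helper kept above. A one-line sketch:

    from scrapling.core.utils import setup_basic_logging

    setup_basic_logging(level='info')  # maps names like 'debug'/'info' onto the logging module's levels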
scrapling/engines/__init__.py
ADDED
@@ -0,0 +1,7 @@
+from .camo import CamoufoxEngine
+from .static import StaticEngine
+from .pw import PlaywrightEngine
+from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
+from .toolbelt import check_if_engine_usable
+
+__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
scrapling/engines/camo.py
ADDED
@@ -0,0 +1,128 @@
+import logging
+from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
+
+from scrapling.engines.toolbelt import (
+    Response,
+    do_nothing,
+    get_os_name,
+    intercept_route,
+    check_type_validity,
+    construct_proxy_dict,
+    generate_convincing_referer,
+)
+
+from camoufox.sync_api import Camoufox
+
+
+class CamoufoxEngine:
+    def __init__(
+            self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
+            block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = False, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
+            timeout: Optional[float] = 30000, page_action: Callable = do_nothing, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
+            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, adaptor_arguments: Dict = None
+    ):
+        """An engine that utilizes Camoufox library, check the `StealthyFetcher` class for more documentation.
+
+        :param headless: Run the browser in headless/hidden (default), virtual screen mode, or headful/visible mode.
+        :param block_images: Prevent the loading of images through Firefox preferences.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param disable_resources: Drop requests of unnecessary resources for a speed boost. It depends but it made requests ~25% faster in my tests for some websites.
+            Requests dropped are of type `font`, `image`, `media`, `beacon`, `object`, `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`.
+            This can help save your proxy usage but be careful with this option as it makes some websites never finish loading.
+        :param block_webrtc: Blocks WebRTC entirely.
+        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
+        :param humanize: Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window.
+        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
+        :param network_idle: Wait for the page until there are no network connections for at least 500 ms.
+        :param os_randomize: If enabled, Scrapling will randomize the OS fingerprints used. The default is Scrapling matching the fingerprints with the current OS.
+        :param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000
+        :param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
+        :param wait_selector: Wait for a specific css selector to be in a specific state.
+        :param wait_selector_state: The state to wait for the selector given with `wait_selector`. Default state is `attached`.
+        :param google_search: Enabled by default, Scrapling will set the referer header to be as if this request came from a Google search for this website's domain name.
+        :param extra_headers: A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._
+        :param proxy: The proxy to be used with requests, it can be a string or a dictionary with the keys 'server', 'username', and 'password' only.
+        :param adaptor_arguments: The arguments that will be passed in the end while creating the final Adaptor's class.
+        """
+        self.headless = headless
+        self.block_images = bool(block_images)
+        self.disable_resources = bool(disable_resources)
+        self.block_webrtc = bool(block_webrtc)
+        self.allow_webgl = bool(allow_webgl)
+        self.network_idle = bool(network_idle)
+        self.google_search = bool(google_search)
+        self.os_randomize = bool(os_randomize)
+        self.extra_headers = extra_headers or {}
+        self.proxy = construct_proxy_dict(proxy)
+        self.addons = addons or []
+        self.humanize = humanize
+        self.timeout = check_type_validity(timeout, [int, float], 30000)
+        if callable(page_action):
+            self.page_action = page_action
+        else:
+            self.page_action = do_nothing
+            logging.error('[Ignored] Argument "page_action" must be callable')
+
+        self.wait_selector = wait_selector
+        self.wait_selector_state = wait_selector_state
+        self.adaptor_arguments = adaptor_arguments if adaptor_arguments else {}
+
+    def fetch(self, url: str) -> Response:
+        """Opens up the browser and do your request based on your chosen options.
+
+        :param url: Target url.
+        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
+        """
+        with Camoufox(
+                proxy=self.proxy,
+                addons=self.addons,
+                headless=self.headless,
+                humanize=self.humanize,
+                i_know_what_im_doing=True,  # To turn warnings off with the user configurations
+                allow_webgl=self.allow_webgl,
+                block_webrtc=self.block_webrtc,
+                block_images=self.block_images,  # Careful! it makes some websites doesn't finish loading at all like stackoverflow even in headful
+                os=None if self.os_randomize else get_os_name(),
+        ) as browser:
+            page = browser.new_page()
+            page.set_default_navigation_timeout(self.timeout)
+            page.set_default_timeout(self.timeout)
+            if self.disable_resources:
+                page.route("**/*", intercept_route)
+
+            if self.extra_headers:
+                page.set_extra_http_headers(self.extra_headers)
+
+            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            page.wait_for_load_state(state="domcontentloaded")
+            if self.network_idle:
+                page.wait_for_load_state('networkidle')
+
+            page = self.page_action(page)
+
+            if self.wait_selector and type(self.wait_selector) is str:
+                waiter = page.locator(self.wait_selector)
+                waiter.wait_for(state=self.wait_selector_state)
+
+            content_type = res.headers.get('content-type', '')
+            # Parse charset from content-type
+            encoding = 'utf-8'  # default encoding
+            if 'charset=' in content_type.lower():
+                encoding = content_type.lower().split('charset=')[-1].split(';')[0].strip()
+
+            response = Response(
+                url=res.url,
+                text=page.content(),
+                content=res.body(),
+                status=res.status,
+                reason=res.status_text,
+                encoding=encoding,
+                cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
+                headers=res.all_headers(),
+                request_headers=res.request.all_headers(),
+                adaptor_arguments=self.adaptor_arguments
+            )
+            page.close()
+
+        return response
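Taken together, fetch() opens a fresh Camoufox browser per request and wraps the result in the toolbelt's Response. A direct sketch (the URL is a placeholder, and Camoufox plus its browser build must be installed; in practice the StealthyFetcher class mentioned in the docstring is the intended entry point):

    from scrapling.engines import CamoufoxEngine

    engine = CamoufoxEngine(headless=True, network_idle=True)
    response = engine.fetch('https://example.com')
    print(response.status, response.reason)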
scrapling/engines/constants.py
ADDED
@@ -0,0 +1,108 @@
+# Disable loading these resources for speed
+DEFAULT_DISABLED_RESOURCES = [
+    'font',
+    'image',
+    'media',
+    'beacon',
+    'object',
+    'imageset',
+    'texttrack',
+    'websocket',
+    'csp_report',
+    'stylesheet',
+]
+
+DEFAULT_STEALTH_FLAGS = [
+    # Explanation: https://peter.sh/experiments/chromium-command-line-switches/
+    # Generally this will make the browser faster and less detectable
+    '--no-pings',
+    '--incognito',
+    '--test-type',
+    '--lang=en-US',
+    '--mute-audio',
+    '--no-first-run',
+    '--disable-sync',
+    '--hide-scrollbars',
+    '--disable-logging',
+    '--start-maximized',  # For headless check bypass
+    '--enable-async-dns',
+    '--disable-breakpad',
+    '--disable-infobars',
+    '--accept-lang=en-US',
+    '--use-mock-keychain',
+    '--disable-translate',
+    '--disable-extensions',
+    '--disable-voice-input',
+    '--window-position=0,0',
+    '--disable-wake-on-wifi',
+    '--ignore-gpu-blocklist',
+    '--enable-tcp-fast-open',
+    '--enable-web-bluetooth',
+    '--disable-hang-monitor',
+    '--password-store=basic',
+    '--disable-cloud-import',
+    '--disable-default-apps',
+    '--disable-print-preview',
+    '--disable-dev-shm-usage',
+    '--disable-popup-blocking',
+    '--metrics-recording-only',
+    '--disable-crash-reporter',
+    '--disable-partial-raster',
+    '--disable-gesture-typing',
+    '--disable-checker-imaging',
+    '--disable-prompt-on-repost',
+    '--force-color-profile=srgb',
+    '--font-render-hinting=none',
+    '--no-default-browser-check',
+    '--aggressive-cache-discard',
+    '--disable-component-update',
+    '--disable-cookie-encryption',
+    '--disable-domain-reliability',
+    '--disable-threaded-animation',
+    '--disable-threaded-scrolling',
+    # '--disable-reading-from-canvas',  # For Firefox
+    '--enable-simple-cache-backend',
+    '--disable-background-networking',
+    '--disable-session-crashed-bubble',
+    '--enable-surface-synchronization',
+    '--disable-image-animation-resync',
+    '--disable-renderer-backgrounding',
+    '--disable-ipc-flooding-protection',
+    '--prerender-from-omnibox=disabled',
+    '--safebrowsing-disable-auto-update',
+    '--disable-offer-upload-credit-cards',
+    '--disable-features=site-per-process',
+    '--disable-background-timer-throttling',
+    '--disable-new-content-rendering-timeout',
+    '--run-all-compositor-stages-before-draw',
+    '--disable-client-side-phishing-detection',
+    '--disable-backgrounding-occluded-windows',
+    '--disable-layer-tree-host-memory-pressure',
+    '--autoplay-policy=no-user-gesture-required',
+    '--disable-offer-store-unmasked-wallet-cards',
+    '--disable-blink-features=AutomationControlled',
+    '--webrtc-ip-handling-policy=disable_non_proxied_udp',
+    '--disable-component-extensions-with-background-pages',
+    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
+    '--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance',
+    '--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4',
+    '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees',
+]
+
+# Defaulting to the docker mode, token doesn't matter in it as it's passed for the container
+NSTBROWSER_DEFAULT_QUERY = {
+    "once": True,
+    "headless": True,
+    "autoClose": True,
+    "fingerprint": {
+        "flags": {
+            "timezone": "BasedOnIp",
+            "screen": "Custom"
+        },
+        "platform": 'linux',  # support: windows, mac, linux
+        "kernel": 'chromium',  # only support: chromium
+        "kernelMilestone": '128',
+        "hardwareConcurrency": 8,
+        "deviceMemory": 8,
+    },
+}
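These constants back the engines above. The toolbelt's intercept_route is not shown in this diff, but the routing pattern it plugs into looks roughly like this hypothetical handler (the route methods are standard Playwright API):

    from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES

    def block_heavy_resources(route):
        # Hypothetical stand-in for the toolbelt's intercept_route:
        # abort requests whose resource type is on the disable list, let the rest through
        if route.request.resource_type in DEFAULT_DISABLED_RESOURCES:
            route.abort()
        else:
            route.continue_()

    # Registered on a page exactly as camo.py does above:
    # page.route("**/*", block_heavy_resources)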