scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +4 -3
- scrapling/core/__init__.py +0 -0
- scrapling/core/_types.py +25 -0
- scrapling/{custom_types.py → core/custom_types.py} +48 -3
- scrapling/{mixins.py → core/mixins.py} +22 -7
- scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
- scrapling/{translator.py → core/translator.py} +2 -12
- scrapling/{utils.py → core/utils.py} +14 -61
- scrapling/engines/__init__.py +7 -0
- scrapling/engines/camo.py +128 -0
- scrapling/engines/constants.py +108 -0
- scrapling/engines/pw.py +237 -0
- scrapling/engines/static.py +112 -0
- scrapling/engines/toolbelt/__init__.py +19 -0
- scrapling/engines/toolbelt/custom.py +154 -0
- scrapling/engines/toolbelt/fingerprints.py +81 -0
- scrapling/engines/toolbelt/navigation.py +108 -0
- scrapling/fetchers.py +198 -0
- scrapling/parser.py +223 -70
- scrapling/py.typed +1 -0
- scrapling-0.2.1.dist-info/METADATA +835 -0
- scrapling-0.2.1.dist-info/RECORD +33 -0
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
- tests/__init__.py +1 -0
- tests/fetchers/__init__.py +1 -0
- tests/fetchers/test_camoufox.py +62 -0
- tests/fetchers/test_httpx.py +67 -0
- tests/fetchers/test_playwright.py +74 -0
- tests/parser/__init__.py +0 -0
- tests/parser/test_automatch.py +56 -0
- tests/parser/test_general.py +286 -0
- scrapling-0.1.2.dist-info/METADATA +0 -477
- scrapling-0.1.2.dist-info/RECORD +0 -12
- {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/__init__.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# Declare top-level shortcuts
|
2
|
+
from scrapling.fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher, CustomFetcher
|
2
3
|
from scrapling.parser import Adaptor, Adaptors
|
3
|
-
from scrapling.custom_types import TextHandler, AttributesHandler
|
4
|
+
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
4
5
|
|
5
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
6
|
-
__version__ = "0.1.2"
|
7
|
+
__version__ = "0.2.1"
|
7
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
8
9
|
|
9
10
|
|
10
|
-
__all__ = ['Adaptor', '
|
11
|
+
__all__ = ['Adaptor', 'Fetcher', 'StealthyFetcher', 'PlayWrightFetcher']
|
File without changes
|
scrapling/core/_types.py
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
"""
|
2
|
+
Type definitions for type checking purposes.
|
3
|
+
"""
|
4
|
+
|
5
|
+
from typing import (
|
6
|
+
Dict, Optional, Union, Callable, Any, List, Tuple, Pattern, Generator, Iterable, Type, TYPE_CHECKING, Literal
|
7
|
+
)
|
8
|
+
|
9
|
+
try:
|
10
|
+
from typing import Protocol
|
11
|
+
except ImportError:
|
12
|
+
# Added in Python 3.8
|
13
|
+
Protocol = object
|
14
|
+
|
15
|
+
try:
|
16
|
+
from typing import SupportsIndex
|
17
|
+
except ImportError:
|
18
|
+
# 'SupportsIndex' got added in Python 3.8
|
19
|
+
SupportsIndex = None
|
20
|
+
|
21
|
+
if TYPE_CHECKING:
|
22
|
+
# typing.Self requires Python 3.11
|
23
|
+
from typing_extensions import Self
|
24
|
+
else:
|
25
|
+
Self = object
|
@@ -1,9 +1,9 @@
|
|
1
1
|
import re
|
2
2
|
from types import MappingProxyType
|
3
3
|
from collections.abc import Mapping
|
4
|
-
from typing import Dict, List, Union, Pattern
|
5
4
|
|
6
|
-
from scrapling.utils import _is_iterable, flatten
|
5
|
+
from scrapling.core.utils import _is_iterable, flatten
|
6
|
+
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
|
7
7
|
|
8
8
|
from orjson import loads, dumps
|
9
9
|
from w3lib.html import replace_entities as _replace_entities
|
@@ -69,7 +69,7 @@ class TextHandler(str):
|
|
69
69
|
return [TextHandler(_replace_entities(s)) for s in results]
|
70
70
|
|
71
71
|
def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
|
72
|
-
clean_match: bool = False, case_sensitive: bool = False,
|
72
|
+
clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
|
73
73
|
"""Apply the given regex to text and return the first match if found, otherwise return the default value.
|
74
74
|
|
75
75
|
:param regex: Can be either a compiled regular expression or a string.
|
@@ -83,6 +83,51 @@ class TextHandler(str):
|
|
83
83
|
return result[0] if result else default
|
84
84
|
|
85
85
|
|
86
|
+
class TextHandlers(List[TextHandler]):
    """
    A ``List`` of :class:`TextHandler` items with regex helpers that operate on every element at once.
    """
    __slots__ = ()

    def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[TextHandler, "TextHandlers[TextHandler]"]:
        """Return a single element for an index, or a new instance of this class for a slice."""
        picked = super().__getitem__(pos)
        # Slicing a plain list yields a plain list; re-wrap it so the helpers below stay available.
        return self.__class__(picked) if isinstance(pos, slice) else picked

    def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True, clean_match: bool = False,
           case_sensitive: bool = False) -> 'List[str]':
        """Run ``.re()`` on each element and return the per-element results merged by ``flatten``.

        :param regex: Can be either a compiled regular expression or a string.
        :param replace_entities: Forwarded unchanged to each element's ``.re()``.
        :param clean_match: Forwarded unchanged to each element's ``.re()``.
        :param case_sensitive: Forwarded unchanged to each element's ``.re()``.
        """
        return flatten([
            handler.re(regex, replace_entities, clean_match, case_sensitive) for handler in self
        ])

    def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
                 clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
        """Return the first match produced by calling ``.re()`` on the elements in order.

        :param regex: Can be either a compiled regular expression or a string.
        :param default: The value returned when no element produces a match.
        :param replace_entities: Forwarded unchanged to each element's ``.re()``.
        :param clean_match: Forwarded unchanged to each element's ``.re()``.
        :param case_sensitive: Forwarded unchanged to each element's ``.re()``.
        """
        # Lazy on purpose: elements after the first one that matches are never searched.
        matches = (
            match
            for handler in self
            for match in handler.re(regex, replace_entities, clean_match, case_sensitive)
        )
        return next(matches, default)
|
129
|
+
|
130
|
+
|
86
131
|
class AttributesHandler(Mapping):
|
87
132
|
"""A read-only mapping to use instead of the standard dictionary for the speed boost but
|
88
133
|
at the same time I use it to add more functionalities.
|
@@ -4,7 +4,7 @@ class SelectorsGeneration:
|
|
4
4
|
Trying to generate selectors like Firefox or maybe cleaner ones!? Ehm
|
5
5
|
Inspiration: https://searchfox.org/mozilla-central/source/devtools/shared/inspector/css-logic.js#591"""
|
6
6
|
|
7
|
-
def __general_selection(self, selection: str = 'css') -> str:
|
7
|
+
def __general_selection(self, selection: str = 'css', full_path=False) -> str:
|
8
8
|
"""Generate a selector for the current element.
|
9
9
|
:return: A string of the generated selector.
|
10
10
|
"""
|
@@ -20,10 +20,11 @@ class SelectorsGeneration:
|
|
20
20
|
else f"[@id='{target.attrib['id']}']"
|
21
21
|
)
|
22
22
|
selectorPath.append(part)
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
23
|
+
if not full_path:
|
24
|
+
return (
|
25
|
+
" > ".join(reversed(selectorPath)) if css
|
26
|
+
else '//*' + "/".join(reversed(selectorPath))
|
27
|
+
)
|
27
28
|
else:
|
28
29
|
part = f'{target.tag}'
|
29
30
|
# We won't use classes anymore because I some websites share exact classes between elements
|
@@ -60,15 +61,29 @@ class SelectorsGeneration:
|
|
60
61
|
)
|
61
62
|
|
62
63
|
@property
|
63
|
-
def
|
64
|
+
def generate_css_selector(self) -> str:
|
64
65
|
"""Generate a CSS selector for the current element
|
65
66
|
:return: A string of the generated selector.
|
66
67
|
"""
|
67
68
|
return self.__general_selection()
|
68
69
|
|
69
70
|
@property
|
70
|
-
def
|
71
|
+
def generate_full_css_selector(self) -> str:
|
72
|
+
"""Generate a complete CSS selector for the current element
|
73
|
+
:return: A string of the generated selector.
|
74
|
+
"""
|
75
|
+
return self.__general_selection(full_path=True)
|
76
|
+
|
77
|
+
@property
|
78
|
+
def generate_xpath_selector(self) -> str:
|
71
79
|
"""Generate a XPath selector for the current element
|
72
80
|
:return: A string of the generated selector.
|
73
81
|
"""
|
74
82
|
return self.__general_selection('xpath')
|
83
|
+
|
84
|
+
@property
|
85
|
+
def generate_full_xpath_selector(self) -> str:
|
86
|
+
"""Generate a complete XPath selector for the current element
|
87
|
+
:return: A string of the generated selector.
|
88
|
+
"""
|
89
|
+
return self.__general_selection('xpath', full_path=True)
|
@@ -4,9 +4,9 @@ import logging
|
|
4
4
|
import threading
|
5
5
|
from hashlib import sha256
|
6
6
|
from abc import ABC, abstractmethod
|
7
|
-
from typing import Dict, Optional, Union
|
8
7
|
|
9
|
-
from scrapling.utils import _StorageTools, cache
|
8
|
+
from scrapling.core._types import Dict, Optional, Union
|
9
|
+
from scrapling.core.utils import _StorageTools, cache
|
10
10
|
|
11
11
|
from lxml import html
|
12
12
|
from tldextract import extract as tld
|
@@ -9,24 +9,14 @@ which will be important in future releases but most importantly...
|
|
9
9
|
import re
|
10
10
|
|
11
11
|
from w3lib.html import HTML5_WHITESPACE
|
12
|
-
from
|
13
|
-
|
14
|
-
from typing import Protocol
|
15
|
-
except ImportError:
|
16
|
-
# Added in Python 3.8
|
17
|
-
Protocol = object
|
18
|
-
|
19
|
-
from scrapling.utils import cache
|
12
|
+
from scrapling.core.utils import cache
|
13
|
+
from scrapling.core._types import Any, Optional, Protocol, Self
|
20
14
|
|
21
15
|
from cssselect.xpath import ExpressionError
|
22
16
|
from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
23
17
|
from cssselect import HTMLTranslator as OriginalHTMLTranslator
|
24
18
|
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
|
25
19
|
|
26
|
-
if TYPE_CHECKING:
|
27
|
-
# typing.Self requires Python 3.11
|
28
|
-
from typing_extensions import Self
|
29
|
-
|
30
20
|
|
31
21
|
regex = f"[{HTML5_WHITESPACE}]+"
|
32
22
|
replace_html5_whitespaces = re.compile(regex).sub
|
@@ -1,14 +1,14 @@
|
|
1
1
|
import re
|
2
|
-
import os
|
3
2
|
import logging
|
4
3
|
from itertools import chain
|
5
|
-
from logging import handlers
|
6
4
|
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
7
5
|
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
8
6
|
|
9
|
-
from
|
7
|
+
from scrapling.core._types import Dict, Iterable, Any, Union
|
10
8
|
|
9
|
+
import orjson
|
11
10
|
from lxml import html
|
11
|
+
|
12
12
|
html_forbidden = {html.HtmlComment, }
|
13
13
|
logging.basicConfig(
|
14
14
|
level=logging.ERROR,
|
@@ -19,6 +19,17 @@ logging.basicConfig(
|
|
19
19
|
)
|
20
20
|
|
21
21
|
|
22
|
+
def is_jsonable(content: Union[bytes, str]) -> bool:
    """Report whether ``content`` parses as a JSON document.

    The payload is handed to ``orjson.loads`` as-is. ``orjson`` accepts both ``str`` and ``bytes`` and
    reports malformed JSON, invalid UTF-8 and unsupported input types through ``orjson.JSONDecodeError``,
    so no manual decoding is needed. Decoding first would raise ``UnicodeDecodeError`` on non-UTF-8
    bytes instead of answering ``False``.

    :param content: The raw text or bytes to test.
    :return: ``True`` if the payload parsed successfully, ``False`` otherwise.
    """
    try:
        orjson.loads(content)
    except orjson.JSONDecodeError:
        return False
    return True
|
31
|
+
|
32
|
+
|
22
33
|
@cache(None, typed=True)
|
23
34
|
def setup_basic_logging(level: str = 'debug'):
|
24
35
|
levels = {
|
@@ -45,64 +56,6 @@ def _is_iterable(s: Any):
|
|
45
56
|
return isinstance(s, (list, tuple,))
|
46
57
|
|
47
58
|
|
48
|
-
@cache(None, typed=True)
|
49
|
-
class _Logger(object):
|
50
|
-
# I will leave this class here for now in case I decide I want to come back to use it :)
|
51
|
-
__slots__ = ('console_logger', 'logger_file_path',)
|
52
|
-
levels = {
|
53
|
-
'debug': logging.DEBUG,
|
54
|
-
'info': logging.INFO,
|
55
|
-
'warning': logging.WARNING,
|
56
|
-
'error': logging.ERROR,
|
57
|
-
'critical': logging.CRITICAL
|
58
|
-
}
|
59
|
-
|
60
|
-
def __init__(self, filename: str = 'debug.log', level: str = 'debug', when: str = 'midnight', backcount: int = 1):
|
61
|
-
os.makedirs(os.path.join(os.path.dirname(__file__), 'logs'), exist_ok=True)
|
62
|
-
format_str = logging.Formatter("[%(asctime)s] %(levelname)s: %(message)s", "%Y-%m-%d %H:%M:%S")
|
63
|
-
|
64
|
-
# on-screen output
|
65
|
-
lvl = self.levels[level.lower()]
|
66
|
-
self.console_logger = logging.getLogger('Scrapling')
|
67
|
-
self.console_logger.setLevel(lvl)
|
68
|
-
console_handler = logging.StreamHandler()
|
69
|
-
console_handler.setLevel(lvl)
|
70
|
-
console_handler.setFormatter(format_str)
|
71
|
-
self.console_logger.addHandler(console_handler)
|
72
|
-
|
73
|
-
if lvl == logging.DEBUG:
|
74
|
-
filename = os.path.join(os.path.dirname(__file__), 'logs', filename)
|
75
|
-
self.logger_file_path = filename
|
76
|
-
# Automatically generates the logging file at specified intervals
|
77
|
-
file_handler = handlers.TimedRotatingFileHandler(
|
78
|
-
# If more than (backcount+1) existed, oldest logs will be deleted
|
79
|
-
filename=filename, when=when, backupCount=backcount, encoding='utf-8'
|
80
|
-
)
|
81
|
-
file_handler.setLevel(lvl)
|
82
|
-
file_handler.setFormatter(format_str)
|
83
|
-
# This for the logger when it appends the date to the new log
|
84
|
-
file_handler.namer = lambda name: name.replace(".log", "") + ".log"
|
85
|
-
self.console_logger.addHandler(file_handler)
|
86
|
-
self.debug(f'Debug log path: {self.logger_file_path}')
|
87
|
-
else:
|
88
|
-
self.logger_file_path = None
|
89
|
-
|
90
|
-
def debug(self, message: str) -> None:
|
91
|
-
self.console_logger.debug(message)
|
92
|
-
|
93
|
-
def info(self, message: str) -> None:
|
94
|
-
self.console_logger.info(message)
|
95
|
-
|
96
|
-
def warning(self, message: str) -> None:
|
97
|
-
self.console_logger.warning(message)
|
98
|
-
|
99
|
-
def error(self, message: str) -> None:
|
100
|
-
self.console_logger.error(message)
|
101
|
-
|
102
|
-
def critical(self, message: str) -> None:
|
103
|
-
self.console_logger.critical(message)
|
104
|
-
|
105
|
-
|
106
59
|
class _StorageTools:
|
107
60
|
@staticmethod
|
108
61
|
def __clean_attributes(element: html.HtmlElement, forbidden: tuple = ()) -> Dict:
|
@@ -0,0 +1,7 @@
|
|
1
|
+
from .camo import CamoufoxEngine
|
2
|
+
from .static import StaticEngine
|
3
|
+
from .pw import PlaywrightEngine
|
4
|
+
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
|
5
|
+
from .toolbelt import check_if_engine_usable
|
6
|
+
|
7
|
+
__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
|
@@ -0,0 +1,128 @@
|
|
1
|
+
import logging
|
2
|
+
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
3
|
+
|
4
|
+
from scrapling.engines.toolbelt import (
|
5
|
+
Response,
|
6
|
+
do_nothing,
|
7
|
+
get_os_name,
|
8
|
+
intercept_route,
|
9
|
+
check_type_validity,
|
10
|
+
construct_proxy_dict,
|
11
|
+
generate_convincing_referer,
|
12
|
+
)
|
13
|
+
|
14
|
+
from camoufox.sync_api import Camoufox
|
15
|
+
|
16
|
+
|
17
|
+
class CamoufoxEngine:
    def __init__(
            self,
            headless: Optional[Union[bool, Literal['virtual']]] = True,
            block_images: Optional[bool] = False,
            disable_resources: Optional[bool] = False,
            block_webrtc: Optional[bool] = False,
            allow_webgl: Optional[bool] = False,
            network_idle: Optional[bool] = False,
            humanize: Optional[Union[bool, float]] = True,
            timeout: Optional[float] = 30000,
            page_action: Callable = do_nothing,
            wait_selector: Optional[str] = None,
            addons: Optional[List[str]] = None,
            wait_selector_state: str = 'attached',
            google_search: Optional[bool] = True,
            extra_headers: Optional[Dict[str, str]] = None,
            proxy: Optional[Union[str, Dict[str, str]]] = None,
            os_randomize: Optional[bool] = None,
            adaptor_arguments: Dict = None
    ):
        """Engine built on the Camoufox library; see the `StealthyFetcher` class for the full documentation.

        :param headless: Run the browser hidden (default), in virtual screen mode, or visible.
        :param block_images: Stop images from loading through Firefox preferences. Saves proxy usage, but some
            websites never finish loading with it.
        :param disable_resources: Drop requests for unnecessary resources for a speed boost (about 25% faster on
            some websites in the author's tests). Dropped types are `font`, `image`, `media`, `beacon`, `object`,
            `imageset`, `texttrack`, `websocket`, `csp_report`, and `stylesheet`. Saves proxy usage, but some
            websites never finish loading with it.
        :param block_webrtc: Blocks WebRTC entirely.
        :param allow_webgl: Whether to allow WebGL. To prevent leaks, only use this for special cases.
        :param network_idle: Wait until there are no network connections for at least 500 ms.
        :param humanize: Humanize the cursor movement. Either True or the MAX duration in seconds of the cursor
            movement. The cursor typically takes up to 1.5 seconds to move across the window.
        :param timeout: Timeout in milliseconds applied to all page operations and waits. The default is 30000.
        :param page_action: Automation hook. A function that receives the `page` object, does what you need, then
            returns `page` again.
        :param wait_selector: Wait for a specific css selector to be in a specific state.
        :param addons: List of Firefox addons to use. Must be paths to extracted addons.
        :param wait_selector_state: The state to wait for on `wait_selector`. Default state is `attached`.
        :param google_search: Enabled by default. Sets the referer header as if the request came from a Google
            search for this website's domain name.
        :param extra_headers: Extra headers to add to the request. _The referer set by `google_search` takes
            priority over a referer set here if both are used._
        :param proxy: Proxy for the requests: a string, or a dictionary with the keys 'server', 'username', and
            'password' only.
        :param os_randomize: If enabled, randomize the OS fingerprints. By default the fingerprints match the
            current OS.
        :param adaptor_arguments: Arguments passed on when the final Adaptor class is created.
        """
        # Values stored exactly as given.
        self.headless = headless

        # On/off switches are coerced to real booleans, so `None` behaves like `False`.
        self.block_images = bool(block_images)
        self.disable_resources = bool(disable_resources)
        self.block_webrtc = bool(block_webrtc)
        self.allow_webgl = bool(allow_webgl)
        self.network_idle = bool(network_idle)
        self.google_search = bool(google_search)
        self.os_randomize = bool(os_randomize)

        # Optional containers fall back to fresh empty ones.
        self.extra_headers = extra_headers or {}
        self.proxy = construct_proxy_dict(proxy)
        self.addons = addons or []
        self.humanize = humanize
        self.timeout = check_type_validity(timeout, [int, float], 30000)

        # A non-callable hook is replaced by the no-op one and reported rather than raised.
        if not callable(page_action):
            logging.error('[Ignored] Argument "page_action" must be callable')
            page_action = do_nothing
        self.page_action = page_action

        self.wait_selector = wait_selector
        self.wait_selector_state = wait_selector_state
        self.adaptor_arguments = adaptor_arguments or {}

    def fetch(self, url: str) -> Response:
        """Launch the browser, load `url` with the configured options and package the result.

        :param url: Target url.
        :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes:
            `status`, `reason`, `cookies`, `headers`, and `request_headers`
        """
        launch_options = dict(
            proxy=self.proxy,
            addons=self.addons,
            headless=self.headless,
            humanize=self.humanize,
            i_know_what_im_doing=True,  # Silences Camoufox warnings about the user's configuration
            allow_webgl=self.allow_webgl,
            block_webrtc=self.block_webrtc,
            # Careful: some websites (e.g. stackoverflow) never finish loading with this, even in headful mode
            block_images=self.block_images,
            # Only pin the OS fingerprint to the current machine when randomization is off
            os=None if self.os_randomize else get_os_name(),
        )
        with Camoufox(**launch_options) as browser:
            page = browser.new_page()
            page.set_default_navigation_timeout(self.timeout)
            page.set_default_timeout(self.timeout)

            if self.disable_resources:
                page.route("**/*", intercept_route)

            if self.extra_headers:
                page.set_extra_http_headers(self.extra_headers)

            referer = generate_convincing_referer(url) if self.google_search else None
            res = page.goto(url, referer=referer)
            page.wait_for_load_state(state="domcontentloaded")
            if self.network_idle:
                page.wait_for_load_state('networkidle')

            # The hook returns the page object to keep using from here on.
            page = self.page_action(page)

            if self.wait_selector and type(self.wait_selector) is str:
                page.locator(self.wait_selector).wait_for(state=self.wait_selector_state)

            # Take the charset from the content-type header, defaulting to utf-8 when none is declared.
            lowered_type = res.headers.get('content-type', '').lower()
            encoding = (
                lowered_type.split('charset=')[-1].split(';')[0].strip()
                if 'charset=' in lowered_type
                else 'utf-8'
            )

            response = Response(
                url=res.url,
                text=page.content(),
                content=res.body(),
                status=res.status,
                reason=res.status_text,
                encoding=encoding,
                cookies={entry['name']: entry['value'] for entry in page.context.cookies()},
                headers=res.all_headers(),
                request_headers=res.request.all_headers(),
                adaptor_arguments=self.adaptor_arguments
            )
            page.close()

        return response
|
@@ -0,0 +1,108 @@
|
|
1
|
+
# Resource types that are not loaded, for speed.
DEFAULT_DISABLED_RESOURCES = [
    "font", "image", "media", "beacon", "object",
    "imageset", "texttrack", "websocket", "csp_report", "stylesheet",
]
|
14
|
+
|
15
|
+
# Chromium command-line switches, listed in order of increasing length.
# Explanation of each switch: https://peter.sh/experiments/chromium-command-line-switches/
# Generally this will make the browser faster and less detectable.
# NOTE(review): "--disable-features" appears twice with different values — confirm which one Chromium honors.
DEFAULT_STEALTH_FLAGS = [
    "--no-pings",
    "--incognito",
    "--test-type",
    "--lang=en-US",
    "--mute-audio",
    "--no-first-run",
    "--disable-sync",
    "--hide-scrollbars",
    "--disable-logging",
    "--start-maximized",  # For headless check bypass
    "--enable-async-dns",
    "--disable-breakpad",
    "--disable-infobars",
    "--accept-lang=en-US",
    "--use-mock-keychain",
    "--disable-translate",
    "--disable-extensions",
    "--disable-voice-input",
    "--window-position=0,0",
    "--disable-wake-on-wifi",
    "--ignore-gpu-blocklist",
    "--enable-tcp-fast-open",
    "--enable-web-bluetooth",
    "--disable-hang-monitor",
    "--password-store=basic",
    "--disable-cloud-import",
    "--disable-default-apps",
    "--disable-print-preview",
    "--disable-dev-shm-usage",
    "--disable-popup-blocking",
    "--metrics-recording-only",
    "--disable-crash-reporter",
    "--disable-partial-raster",
    "--disable-gesture-typing",
    "--disable-checker-imaging",
    "--disable-prompt-on-repost",
    "--force-color-profile=srgb",
    "--font-render-hinting=none",
    "--no-default-browser-check",
    "--aggressive-cache-discard",
    "--disable-component-update",
    "--disable-cookie-encryption",
    "--disable-domain-reliability",
    "--disable-threaded-animation",
    "--disable-threaded-scrolling",
    # "--disable-reading-from-canvas",  # For Firefox
    "--enable-simple-cache-backend",
    "--disable-background-networking",
    "--disable-session-crashed-bubble",
    "--enable-surface-synchronization",
    "--disable-image-animation-resync",
    "--disable-renderer-backgrounding",
    "--disable-ipc-flooding-protection",
    "--prerender-from-omnibox=disabled",
    "--safebrowsing-disable-auto-update",
    "--disable-offer-upload-credit-cards",
    "--disable-features=site-per-process",
    "--disable-background-timer-throttling",
    "--disable-new-content-rendering-timeout",
    "--run-all-compositor-stages-before-draw",
    "--disable-client-side-phishing-detection",
    "--disable-backgrounding-occluded-windows",
    "--disable-layer-tree-host-memory-pressure",
    "--autoplay-policy=no-user-gesture-required",
    "--disable-offer-store-unmasked-wallet-cards",
    "--disable-blink-features=AutomationControlled",
    "--webrtc-ip-handling-policy=disable_non_proxied_udp",
    "--disable-component-extensions-with-background-pages",
    "--force-webrtc-ip-handling-policy=disable_non_proxied_udp",
    "--enable-features=NetworkService,NetworkServiceInProcess,TrustTokens,TrustTokensAlwaysAllowIssuance",
    "--blink-settings=primaryHoverType=2,availableHoverTypes=2,primaryPointerType=4,availablePointerTypes=4",
    "--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process,TranslateUI,BlinkGenPropertyTrees",
]
|
91
|
+
|
92
|
+
# Defaults target the docker mode; the token doesn't matter there because it is passed to the container.
NSTBROWSER_DEFAULT_QUERY = {
    "once": True,
    "headless": True,
    "autoClose": True,
    "fingerprint": {
        "flags": {"timezone": "BasedOnIp", "screen": "Custom"},
        "platform": "linux",  # support: windows, mac, linux
        "kernel": "chromium",  # only support: chromium
        "kernelMilestone": "128",
        "hardwareConcurrency": 8,
        "deviceMemory": 8,
    },
}
|