scrapling 0.2.7__py3-none-any.whl → 0.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +4 -3
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +5 -5
- scrapling/core/storage_adaptors.py +6 -6
- scrapling/core/translator.py +5 -6
- scrapling/core/utils.py +15 -12
- scrapling/defaults.py +1 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +8 -12
- scrapling/engines/pw.py +9 -14
- scrapling/engines/static.py +4 -3
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +3 -2
- scrapling/engines/toolbelt/fingerprints.py +5 -5
- scrapling/engines/toolbelt/navigation.py +6 -6
- scrapling/fetchers.py +5 -4
- scrapling/parser.py +15 -8
- {scrapling-0.2.7.dist-info → scrapling-0.2.8.dist-info}/METADATA +14 -14
- scrapling-0.2.8.dist-info/RECORD +42 -0
- tests/fetchers/test_camoufox.py +1 -0
- tests/fetchers/test_httpx.py +1 -0
- tests/fetchers/test_playwright.py +1 -0
- tests/parser/test_general.py +3 -1
- scrapling-0.2.7.dist-info/RECORD +0 -42
- {scrapling-0.2.7.dist-info → scrapling-0.2.8.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.8.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.8.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -1,10 +1,11 @@
|
|
1
1
|
# Declare top-level shortcuts
|
2
|
-
from scrapling.
|
2
|
+
from scrapling.core.custom_types import AttributesHandler, TextHandler
|
3
|
+
from scrapling.fetchers import (CustomFetcher, Fetcher, PlayWrightFetcher,
|
4
|
+
StealthyFetcher)
|
3
5
|
from scrapling.parser import Adaptor, Adaptors
|
4
|
-
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
6
|
|
6
7
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.
|
8
|
+
__version__ = "0.2.8"
|
8
9
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
10
|
|
10
11
|
|
scrapling/core/_types.py
CHANGED
@@ -2,9 +2,8 @@
|
|
2
2
|
Type definitions for type checking purposes.
|
3
3
|
"""
|
4
4
|
|
5
|
-
from typing import (
|
6
|
-
|
7
|
-
)
|
5
|
+
from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
|
6
|
+
List, Literal, Optional, Pattern, Tuple, Type, Union)
|
8
7
|
|
9
8
|
try:
|
10
9
|
from typing import Protocol
|
scrapling/core/custom_types.py
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
import re
|
2
|
-
from types import MappingProxyType
|
3
2
|
from collections.abc import Mapping
|
3
|
+
from types import MappingProxyType
|
4
4
|
|
5
|
-
from
|
6
|
-
from scrapling.core._types import Dict, List, Union, Pattern, SupportsIndex
|
7
|
-
|
8
|
-
from orjson import loads, dumps
|
5
|
+
from orjson import dumps, loads
|
9
6
|
from w3lib.html import replace_entities as _replace_entities
|
10
7
|
|
8
|
+
from scrapling.core._types import Dict, List, Pattern, SupportsIndex, Union
|
9
|
+
from scrapling.core.utils import _is_iterable, flatten
|
10
|
+
|
11
11
|
|
12
12
|
class TextHandler(str):
|
13
13
|
"""Extends standard Python string by adding more functionality"""
|
@@ -1,16 +1,16 @@
|
|
1
|
-
import orjson
|
2
|
-
import sqlite3
|
3
1
|
import logging
|
2
|
+
import sqlite3
|
4
3
|
import threading
|
5
|
-
from hashlib import sha256
|
6
4
|
from abc import ABC, abstractmethod
|
5
|
+
from hashlib import sha256
|
7
6
|
|
8
|
-
|
9
|
-
from scrapling.core.utils import _StorageTools, cache
|
10
|
-
|
7
|
+
import orjson
|
11
8
|
from lxml import html
|
12
9
|
from tldextract import extract as tld
|
13
10
|
|
11
|
+
from scrapling.core._types import Dict, Optional, Union
|
12
|
+
from scrapling.core.utils import _StorageTools, cache
|
13
|
+
|
14
14
|
|
15
15
|
class StorageSystemMixin(ABC):
|
16
16
|
# If you want to make your own storage system, you have to inherit from this
|
scrapling/core/translator.py
CHANGED
@@ -10,15 +10,14 @@ So you don't have to learn a new selectors/api method like what bs4 done with so
|
|
10
10
|
|
11
11
|
import re
|
12
12
|
|
13
|
-
from w3lib.html import HTML5_WHITESPACE
|
14
|
-
from scrapling.core.utils import cache
|
15
|
-
from scrapling.core._types import Any, Optional, Protocol, Self
|
16
|
-
|
17
|
-
from cssselect.xpath import ExpressionError
|
18
|
-
from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
19
13
|
from cssselect import HTMLTranslator as OriginalHTMLTranslator
|
20
14
|
from cssselect.parser import Element, FunctionalPseudoElement, PseudoElement
|
15
|
+
from cssselect.xpath import ExpressionError
|
16
|
+
from cssselect.xpath import XPathExpr as OriginalXPathExpr
|
17
|
+
from w3lib.html import HTML5_WHITESPACE
|
21
18
|
|
19
|
+
from scrapling.core._types import Any, Optional, Protocol, Self
|
20
|
+
from scrapling.core.utils import cache
|
22
21
|
|
23
22
|
regex = f"[{HTML5_WHITESPACE}]+"
|
24
23
|
replace_html5_whitespaces = re.compile(regex).sub
|
scrapling/core/utils.py
CHANGED
@@ -1,22 +1,25 @@
|
|
1
|
-
import re
|
2
1
|
import logging
|
2
|
+
import re
|
3
3
|
from itertools import chain
|
4
|
-
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
5
|
-
from functools import lru_cache as cache # functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
6
|
-
|
7
|
-
from scrapling.core._types import Dict, Iterable, Any, Union
|
8
4
|
|
9
5
|
import orjson
|
10
6
|
from lxml import html
|
11
7
|
|
8
|
+
from scrapling.core._types import Any, Dict, Iterable, Union
|
9
|
+
|
10
|
+
# Using cache on top of a class is brilliant way to achieve Singleton design pattern without much code
|
11
|
+
# functools.cache is available on Python 3.9+ only so let's keep lru_cache
|
12
|
+
from functools import lru_cache as cache # isort:skip
|
13
|
+
|
14
|
+
|
12
15
|
html_forbidden = {html.HtmlComment, }
|
13
16
|
logging.basicConfig(
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
17
|
+
level=logging.ERROR,
|
18
|
+
format='%(asctime)s - %(levelname)s - %(message)s',
|
19
|
+
handlers=[
|
20
|
+
logging.StreamHandler()
|
21
|
+
]
|
22
|
+
)
|
20
23
|
|
21
24
|
|
22
25
|
def is_jsonable(content: Union[bytes, str]) -> bool:
|
@@ -94,7 +97,7 @@ class _StorageTools:
|
|
94
97
|
parent = element.getparent()
|
95
98
|
return tuple(
|
96
99
|
(element.tag,) if parent is None else (
|
97
|
-
|
100
|
+
cls._get_element_path(parent) + (element.tag,)
|
98
101
|
)
|
99
102
|
)
|
100
103
|
|
scrapling/defaults.py
CHANGED
@@ -1,4 +1,4 @@
|
|
1
|
-
from .fetchers import Fetcher,
|
1
|
+
from .fetchers import Fetcher, PlayWrightFetcher, StealthyFetcher
|
2
2
|
|
3
3
|
# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
|
4
4
|
Fetcher = Fetcher()
|
scrapling/engines/__init__.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
from .camo import CamoufoxEngine
|
2
|
-
from .static import StaticEngine
|
3
|
-
from .pw import PlaywrightEngine
|
4
2
|
from .constants import DEFAULT_DISABLED_RESOURCES, DEFAULT_STEALTH_FLAGS
|
3
|
+
from .pw import PlaywrightEngine
|
4
|
+
from .static import StaticEngine
|
5
5
|
from .toolbelt import check_if_engine_usable
|
6
6
|
|
7
7
|
__all__ = ['CamoufoxEngine', 'PlaywrightEngine']
|
scrapling/engines/camo.py
CHANGED
@@ -1,20 +1,16 @@
|
|
1
1
|
import logging
|
2
|
-
from scrapling.core._types import Union, Callable, Optional, Dict, List, Literal
|
3
|
-
|
4
|
-
from scrapling.engines.toolbelt import (
|
5
|
-
Response,
|
6
|
-
do_nothing,
|
7
|
-
StatusText,
|
8
|
-
get_os_name,
|
9
|
-
intercept_route,
|
10
|
-
check_type_validity,
|
11
|
-
construct_proxy_dict,
|
12
|
-
generate_convincing_referer,
|
13
|
-
)
|
14
2
|
|
15
3
|
from camoufox import DefaultAddons
|
16
4
|
from camoufox.sync_api import Camoufox
|
17
5
|
|
6
|
+
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
7
|
+
Union)
|
8
|
+
from scrapling.engines.toolbelt import (Response, StatusText,
|
9
|
+
check_type_validity,
|
10
|
+
construct_proxy_dict, do_nothing,
|
11
|
+
generate_convincing_referer,
|
12
|
+
get_os_name, intercept_route)
|
13
|
+
|
18
14
|
|
19
15
|
class CamoufoxEngine:
|
20
16
|
def __init__(
|
scrapling/engines/pw.py
CHANGED
@@ -1,20 +1,15 @@
|
|
1
1
|
import json
|
2
2
|
import logging
|
3
|
-
from scrapling.core._types import Union, Callable, Optional, List, Dict
|
4
3
|
|
5
|
-
from scrapling.
|
6
|
-
from scrapling.engines.
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
check_type_validity,
|
15
|
-
construct_proxy_dict,
|
16
|
-
generate_convincing_referer,
|
17
|
-
)
|
4
|
+
from scrapling.core._types import Callable, Dict, List, Optional, Union
|
5
|
+
from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
|
6
|
+
NSTBROWSER_DEFAULT_QUERY)
|
7
|
+
from scrapling.engines.toolbelt import (Response, StatusText,
|
8
|
+
check_type_validity, construct_cdp_url,
|
9
|
+
construct_proxy_dict, do_nothing,
|
10
|
+
generate_convincing_referer,
|
11
|
+
generate_headers, intercept_route,
|
12
|
+
js_bypass_path)
|
18
13
|
|
19
14
|
|
20
15
|
class PlaywrightEngine:
|
scrapling/engines/static.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
import logging
|
2
2
|
|
3
|
-
from scrapling.core._types import Union, Optional, Dict
|
4
|
-
from .toolbelt import Response, generate_convincing_referer, generate_headers
|
5
|
-
|
6
3
|
import httpx
|
7
4
|
from httpx._models import Response as httpxResponse
|
8
5
|
|
6
|
+
from scrapling.core._types import Dict, Optional, Union
|
7
|
+
|
8
|
+
from .toolbelt import Response, generate_convincing_referer, generate_headers
|
9
|
+
|
9
10
|
|
10
11
|
class StaticEngine:
|
11
12
|
def __init__(self, follow_redirects: bool = True, timeout: Optional[Union[int, float]] = None, adaptor_arguments: Dict = None):
|
@@ -1,20 +1,6 @@
|
|
1
|
-
from .
|
2
|
-
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
Response,
|
8
|
-
do_nothing,
|
9
|
-
StatusText,
|
10
|
-
BaseFetcher,
|
11
|
-
get_variable_name,
|
12
|
-
check_type_validity,
|
13
|
-
check_if_engine_usable,
|
14
|
-
)
|
15
|
-
from .navigation import (
|
16
|
-
js_bypass_path,
|
17
|
-
intercept_route,
|
18
|
-
construct_cdp_url,
|
19
|
-
construct_proxy_dict,
|
20
|
-
)
|
1
|
+
from .custom import (BaseFetcher, Response, StatusText, check_if_engine_usable,
|
2
|
+
check_type_validity, do_nothing, get_variable_name)
|
3
|
+
from .fingerprints import (generate_convincing_referer, generate_headers,
|
4
|
+
get_os_name)
|
5
|
+
from .navigation import (construct_cdp_url, construct_proxy_dict,
|
6
|
+
intercept_route, js_bypass_path)
|
@@ -5,10 +5,11 @@ import inspect
|
|
5
5
|
import logging
|
6
6
|
from email.message import Message
|
7
7
|
|
8
|
+
from scrapling.core._types import (Any, Callable, Dict, List, Optional, Tuple,
|
9
|
+
Type, Union)
|
8
10
|
from scrapling.core.custom_types import MappingProxyType
|
11
|
+
from scrapling.core.utils import cache, setup_basic_logging
|
9
12
|
from scrapling.parser import Adaptor, SQLiteStorageSystem
|
10
|
-
from scrapling.core.utils import setup_basic_logging, cache
|
11
|
-
from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable, Tuple
|
12
13
|
|
13
14
|
|
14
15
|
class ResponseEncoding:
|
@@ -4,12 +4,12 @@ Functions related to generating headers and fingerprints generally
|
|
4
4
|
|
5
5
|
import platform
|
6
6
|
|
7
|
-
from
|
8
|
-
from
|
9
|
-
|
7
|
+
from browserforge.fingerprints import Fingerprint, FingerprintGenerator
|
8
|
+
from browserforge.headers import Browser, HeaderGenerator
|
10
9
|
from tldextract import extract
|
11
|
-
|
12
|
-
from
|
10
|
+
|
11
|
+
from scrapling.core._types import Dict, Union
|
12
|
+
from scrapling.core.utils import cache
|
13
13
|
|
14
14
|
|
15
15
|
@cache(None, typed=True)
|
@@ -2,16 +2,16 @@
|
|
2
2
|
Functions related to files and URLs
|
3
3
|
"""
|
4
4
|
|
5
|
-
import os
|
6
5
|
import logging
|
7
|
-
|
6
|
+
import os
|
7
|
+
from urllib.parse import urlencode, urlparse
|
8
|
+
|
9
|
+
from playwright.sync_api import Route
|
8
10
|
|
11
|
+
from scrapling.core._types import Dict, Optional, Union
|
9
12
|
from scrapling.core.utils import cache
|
10
|
-
from scrapling.core._types import Union, Dict, Optional
|
11
13
|
from scrapling.engines.constants import DEFAULT_DISABLED_RESOURCES
|
12
14
|
|
13
|
-
from playwright.sync_api import Route
|
14
|
-
|
15
15
|
|
16
16
|
def intercept_route(route: Route) -> Union[Route, None]:
|
17
17
|
"""This is just a route handler but it drops requests that its type falls in `DEFAULT_DISABLED_RESOURCES`
|
@@ -43,7 +43,7 @@ def construct_proxy_dict(proxy_string: Union[str, Dict[str, str]]) -> Union[Dict
|
|
43
43
|
}
|
44
44
|
except ValueError:
|
45
45
|
# Urllib will say that one of the parameters above can't be cast to the correct type like `int` for port etc...
|
46
|
-
raise TypeError(
|
46
|
+
raise TypeError('The proxy argument\'s string is in invalid format!')
|
47
47
|
|
48
48
|
elif isinstance(proxy_string, dict):
|
49
49
|
valid_keys = ('server', 'username', 'password', )
|
scrapling/fetchers.py
CHANGED
@@ -1,7 +1,8 @@
|
|
1
|
-
from scrapling.core._types import
|
2
|
-
|
3
|
-
from scrapling.engines
|
4
|
-
|
1
|
+
from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
|
2
|
+
Union)
|
3
|
+
from scrapling.engines import (CamoufoxEngine, PlaywrightEngine, StaticEngine,
|
4
|
+
check_if_engine_usable)
|
5
|
+
from scrapling.engines.toolbelt import BaseFetcher, Response, do_nothing
|
5
6
|
|
6
7
|
|
7
8
|
class Fetcher(BaseFetcher):
|
scrapling/parser.py
CHANGED
@@ -1,16 +1,23 @@
|
|
1
|
+
import inspect
|
1
2
|
import os
|
2
3
|
import re
|
3
|
-
import inspect
|
4
4
|
from difflib import SequenceMatcher
|
5
5
|
|
6
|
-
from
|
7
|
-
from
|
8
|
-
from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
|
9
|
-
from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
|
10
|
-
from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
|
11
|
-
from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
|
6
|
+
from cssselect import SelectorError, SelectorSyntaxError
|
7
|
+
from cssselect import parse as split_selectors
|
12
8
|
from lxml import etree, html
|
13
|
-
|
9
|
+
|
10
|
+
from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
|
11
|
+
List, Optional, Pattern, SupportsIndex,
|
12
|
+
Tuple, Union)
|
13
|
+
from scrapling.core.custom_types import (AttributesHandler, TextHandler,
|
14
|
+
TextHandlers)
|
15
|
+
from scrapling.core.mixins import SelectorsGeneration
|
16
|
+
from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
|
17
|
+
StorageSystemMixin, _StorageTools)
|
18
|
+
from scrapling.core.translator import HTMLTranslator
|
19
|
+
from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
|
20
|
+
is_jsonable, logging, setup_basic_logging)
|
14
21
|
|
15
22
|
|
16
23
|
class Adaptor(SelectorsGeneration):
|
@@ -1,7 +1,7 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.
|
4
|
-
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
3
|
+
Version: 0.2.8
|
4
|
+
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
7
7
|
Author-email: karim.shoair@pm.me
|
@@ -41,7 +41,7 @@ Requires-Dist: tldextract
|
|
41
41
|
Requires-Dist: httpx[brotli,zstd]
|
42
42
|
Requires-Dist: playwright==1.48
|
43
43
|
Requires-Dist: rebrowser-playwright
|
44
|
-
Requires-Dist: camoufox>=0.
|
44
|
+
Requires-Dist: camoufox>=0.4.4
|
45
45
|
Requires-Dist: browserforge
|
46
46
|
|
47
47
|
# 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
|
@@ -52,7 +52,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
|
|
52
52
|
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
|
53
53
|
|
54
54
|
```python
|
55
|
-
>> from scrapling.
|
55
|
+
>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
|
56
56
|
# Fetch websites' source under the radar!
|
57
57
|
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
|
58
58
|
>> print(page.status)
|
@@ -258,25 +258,25 @@ python -m browserforge update
|
|
258
258
|
```
|
259
259
|
|
260
260
|
## Fetching Websites
|
261
|
-
Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you
|
261
|
+
Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
|
262
262
|
|
263
263
|
### Features
|
264
|
-
You might be
|
264
|
+
You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way:
|
265
265
|
```python
|
266
266
|
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
267
267
|
```
|
268
|
-
|
268
|
+
All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
|
269
269
|
|
270
270
|
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
271
271
|
```python
|
272
|
-
from scrapling.
|
272
|
+
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
|
273
273
|
```
|
274
274
|
then use it right away without initializing like:
|
275
275
|
```python
|
276
276
|
page = StealthyFetcher.fetch('https://example.com')
|
277
277
|
```
|
278
278
|
|
279
|
-
Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
|
279
|
+
Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
|
280
280
|
> [!NOTE]
|
281
281
|
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
|
282
282
|
### Fetcher
|
@@ -292,13 +292,13 @@ You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods i
|
|
292
292
|
>> page = Fetcher().delete('https://httpbin.org/delete')
|
293
293
|
```
|
294
294
|
### StealthyFetcher
|
295
|
-
This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
|
295
|
+
This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
|
296
296
|
```python
|
297
297
|
>> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
|
298
298
|
>> page.status == 200
|
299
299
|
True
|
300
300
|
```
|
301
|
-
> Note: all requests done by this fetcher
|
301
|
+
> Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed so you don't have to :)
|
302
302
|
|
303
303
|
<details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>
|
304
304
|
|
@@ -334,7 +334,7 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
|
|
334
334
|
>> page.css_first("#search a::attr(href)")
|
335
335
|
'https://github.com/D4Vinci/Scrapling'
|
336
336
|
```
|
337
|
-
> Note: all requests done by this fetcher
|
337
|
+
> Note: all requests done by this fetcher wait by default for all JS to be fully loaded and executed so you don't have to :)
|
338
338
|
|
339
339
|
Using this Fetcher class, you can make requests with:
|
340
340
|
1) Vanilla Playwright without any modifications other than the ones you chose.
|
@@ -346,7 +346,7 @@ Using this Fetcher class, you can make requests with:
|
|
346
346
|
3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
|
347
347
|
4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.
|
348
348
|
|
349
|
-
> Hence using the `real_chrome` argument requires that you have
|
349
|
+
> Hence using the `real_chrome` argument requires that you have the Chrome browser installed on your device.
|
350
350
|
|
351
351
|
Add that to a lot of controlling/hiding options as you will see in the arguments list below.
|
352
352
|
|
@@ -369,7 +369,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
|
|
369
369
|
| hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
|
370
370
|
| disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
|
371
371
|
| stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
|
372
|
-
| real_chrome | If you have
|
372
|
+
| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
|
373
373
|
| locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
|
374
374
|
| cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
|
375
375
|
| nstbrowser_mode | Enables NSTBrowser mode, **it has to be used with the `cdp_url` argument or it will get completely ignored.** | ✔️ |
|
@@ -0,0 +1,42 @@
|
|
1
|
+
scrapling/__init__.py,sha256=0-gw4uqckCs7ikl6sHiB5c6y0AelpgefqJkBmSd7j1k,469
|
2
|
+
scrapling/defaults.py,sha256=qO6zAS7k5_QXvbjuoBv87fUMqASGMuM2dVry9J9auv0,287
|
3
|
+
scrapling/fetchers.py,sha256=iw1wEuFg14akJYpSg9webfBjAL341Pnofn4IkWahGlE,17486
|
4
|
+
scrapling/parser.py,sha256=suXggr39GimLnnLm9ivM1CQ40AoDwGke2sgnWszqFqk,54331
|
5
|
+
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
|
+
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
+
scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
|
8
|
+
scrapling/core/custom_types.py,sha256=8GCgcZL-IT5lP6titxL-RPCiItQSuJZjSlFIGCDxoSs,8402
|
9
|
+
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
10
|
+
scrapling/core/storage_adaptors.py,sha256=Q2-G7oDqoIqlIBEmnUsKwSzM2lNGNUPKtTbMjTV9178,6218
|
11
|
+
scrapling/core/translator.py,sha256=WN_xPyYrD1MjLPv8Ar8zHNTPC_iYsW29kkjET4hbFI0,5228
|
12
|
+
scrapling/core/utils.py,sha256=RajDRSPkVmszjpwNy8NIz8ZlUxPox8j2rSractr7Q9s,3779
|
13
|
+
scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
|
14
|
+
scrapling/engines/camo.py,sha256=fmpGMW5T7we5cQC8muyvVo_A27yAqc5csm7dO_2jHiE,8446
|
15
|
+
scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
|
16
|
+
scrapling/engines/pw.py,sha256=kWbkHm2vnQYeGuJnicKlAL1HrBKuXoFtyRMNFXLs4VY,13962
|
17
|
+
scrapling/engines/static.py,sha256=h629IjT78YbhjFYBVSli53lKiYrG3929TAaZ7TA-j-Y,8022
|
18
|
+
scrapling/engines/toolbelt/__init__.py,sha256=0tSsxMH5ALOMPXrLkr8mTH7LWg9QfIse4Ij9vUFgYjY,391
|
19
|
+
scrapling/engines/toolbelt/custom.py,sha256=tab_wJmN6onvu2U8tDXeJ9jn6A47jTkmxSBoc-w8dIk,12789
|
20
|
+
scrapling/engines/toolbelt/fingerprints.py,sha256=Y3FW8uqxxeNK3v6vBVvki8VjeG5oRxSwim4Q2Hv_cRk,2917
|
21
|
+
scrapling/engines/toolbelt/navigation.py,sha256=Okpl4ynlLn2cUpSiaaoXDSOdDOXhvxNOOGphE_HXc5k,4016
|
22
|
+
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
|
+
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
24
|
+
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
25
|
+
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
26
|
+
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
27
|
+
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
28
|
+
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
29
|
+
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
30
|
+
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
31
|
+
tests/fetchers/test_camoufox.py,sha256=-1v_0mXeBcAVW932nkFws1HIDCodGbpNYniSnVMHeeU,3116
|
32
|
+
tests/fetchers/test_httpx.py,sha256=rrw9q4KdDAHpQVa4sTmw278Yv1OlwY_SKPbpBPLVN7c,3508
|
33
|
+
tests/fetchers/test_playwright.py,sha256=xwhRmlw7WBrtqyilZsoMHkHpyAx7iXQ-YexDMJURTao,3702
|
34
|
+
tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
|
35
|
+
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
|
+
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
37
|
+
tests/parser/test_general.py,sha256=sPbwQRka9Mh8MDz2Sto8Rwg78t0SWWxELgzhTVPEplE,11785
|
38
|
+
scrapling-0.2.8.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
39
|
+
scrapling-0.2.8.dist-info/METADATA,sha256=0As--zWykpljObaw8DZQJr6udpHm4NyRN-dfUOUrhBc,66605
|
40
|
+
scrapling-0.2.8.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
41
|
+
scrapling-0.2.8.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
42
|
+
scrapling-0.2.8.dist-info/RECORD,,
|
tests/fetchers/test_camoufox.py
CHANGED
tests/fetchers/test_httpx.py
CHANGED
tests/parser/test_general.py
CHANGED
scrapling-0.2.7.dist-info/RECORD
DELETED
@@ -1,42 +0,0 @@
|
|
1
|
-
scrapling/__init__.py,sha256=WjvhJ6xkiSHp7St2YJYYJIsiKL8WDYuAQ_qIsg03v-0,435
|
2
|
-
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
3
|
-
scrapling/fetchers.py,sha256=vjAsa-oleb7FfYsxqmEUVZGNxdo7LMVuiLuyjIGySQE,17417
|
4
|
-
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
5
|
-
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
6
|
-
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
7
|
-
scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
|
8
|
-
scrapling/core/custom_types.py,sha256=ztE_tshJ8i5uKqqSbsN5S6MoIUSfX6SexlhRjAnkclk,8402
|
9
|
-
scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
|
10
|
-
scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHAu1l2Q,6218
|
11
|
-
scrapling/core/translator.py,sha256=R97lKGq1SDbx8S8Hg_w_5d4ePgukTHj_hRIKFzWiRuc,5229
|
12
|
-
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
13
|
-
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
14
|
-
scrapling/engines/camo.py,sha256=Lw_uZ5SMBy3T6MkCNOMPk1i51Lnpfd0M7HyAUJAzKIg,8284
|
15
|
-
scrapling/engines/constants.py,sha256=WTn-X4kFIDWjXTiqOT0tm4XT5pijcdohFyZ0Af2C5Xc,3723
|
16
|
-
scrapling/engines/pw.py,sha256=ZRmbFNQWzvxUHVrIUcKefyg6fDpBrN6erdatDpcLBaw,13762
|
17
|
-
scrapling/engines/static.py,sha256=ryVCIjTpVLNlCxSf_NYwDSdsoDbafnsGpkCoCROPhlI,8021
|
18
|
-
scrapling/engines/toolbelt/__init__.py,sha256=BbxfC0depVOV3i3BnBnyfjHtLcZrDbhz6c5rTRczZUc,383
|
19
|
-
scrapling/engines/toolbelt/custom.py,sha256=KopO0SVWzFoNB8LbFDQhtErm8KCid6nkQcGqRaItC6U,12752
|
20
|
-
scrapling/engines/toolbelt/fingerprints.py,sha256=T9HQejHzAnHsD5EIXvrYVC5siiG5q2gOOXVIIANmzMc,2917
|
21
|
-
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
22
|
-
scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
|
23
|
-
scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
|
24
|
-
scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
|
25
|
-
scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
|
26
|
-
scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
|
27
|
-
scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
|
28
|
-
scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
|
29
|
-
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
30
|
-
tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
|
31
|
-
tests/fetchers/test_camoufox.py,sha256=53piGA5uuPvOx5BeUEA0bbizYihwHGxehnj5uqCr6Q0,3115
|
32
|
-
tests/fetchers/test_httpx.py,sha256=UivOItR3-l-bXp9E6TP5Tvn2OrCdgiVkWsti-f9xdpU,3507
|
33
|
-
tests/fetchers/test_playwright.py,sha256=7qwbIU2SwjiQEbaGPA_MBo6kAXM4IBmfvy5kUvKT11M,3701
|
34
|
-
tests/fetchers/test_utils.py,sha256=FPPJkBrqgYxdGeWwapH8Vj8zyfYVLiTE1qSLu8eBWik,5728
|
35
|
-
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
36
|
-
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
37
|
-
tests/parser/test_general.py,sha256=qaiVzpvqESfdXYFat6QrpnMkevPYgCzIcTZK5FwdC0s,11783
|
38
|
-
scrapling-0.2.7.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
39
|
-
scrapling-0.2.7.dist-info/METADATA,sha256=kYARTFqiiLsL_cvnU03pf2I1E5N_NmJk25gbeLzSR4M,66607
|
40
|
-
scrapling-0.2.7.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
|
41
|
-
scrapling-0.2.7.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
42
|
-
scrapling-0.2.7.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|