scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl
- scrapling/__init__.py +5 -4
- scrapling/core/_types.py +2 -3
- scrapling/core/custom_types.py +93 -11
- scrapling/core/storage_adaptors.py +9 -10
- scrapling/core/translator.py +6 -7
- scrapling/core/utils.py +35 -30
- scrapling/defaults.py +2 -1
- scrapling/engines/__init__.py +2 -2
- scrapling/engines/camo.py +96 -26
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +166 -96
- scrapling/engines/static.py +94 -50
- scrapling/engines/toolbelt/__init__.py +6 -20
- scrapling/engines/toolbelt/custom.py +22 -23
- scrapling/engines/toolbelt/fingerprints.py +7 -7
- scrapling/engines/toolbelt/navigation.py +25 -12
- scrapling/fetchers.py +233 -17
- scrapling/parser.py +63 -28
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
- scrapling-0.2.9.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +263 -219
- scrapling-0.2.7.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -64
- tests/fetchers/test_httpx.py +0 -67
- tests/fetchers/test_playwright.py +0 -76
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
- {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -1,22 +1,31 @@
+import inspect
 import os
 import re
-import inspect
 from difflib import SequenceMatcher
+from urllib.parse import urljoin

-from
-from
-from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
-from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
-from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
-from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+from cssselect import SelectorError, SelectorSyntaxError
+from cssselect import parse as split_selectors
 from lxml import etree, html
-
+
+from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                   List, Optional, Pattern, SupportsIndex,
+                                   Tuple, Union)
+from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                         TextHandlers)
+from scrapling.core.mixins import SelectorsGeneration
+from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                             StorageSystemMixin, _StorageTools)
+from scrapling.core.translator import HTMLTranslator
+from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                  is_jsonable, log)


 class Adaptor(SelectorsGeneration):
     __slots__ = (
-        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
+        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+        '__keep_cdata', '__raw_body'
     )

     def __init__(
@@ -28,10 +37,10 @@ class Adaptor(SelectorsGeneration):
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
+            keep_cdata: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
-            debug: Optional[bool] = True,
             **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
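The two hunks above add a `keep_cdata` flag and drop the old `debug` flag from `Adaptor.__init__`. A minimal usage sketch of the new signature; the sample HTML string is made up purely for illustration:

```python
from scrapling.parser import Adaptor

# Illustrative markup only; any HTML string (or a bytes `body`) works here.
sample = "<html><body><p>Hello <![CDATA[raw data]]> world</p></body></html>"

# `keep_cdata` is new in 0.2.9; the old `debug` argument is gone.
page = Adaptor(text=sample, keep_comments=False, keep_cdata=True, auto_match=False)
print(page.css_first('p'))
```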
@@ -51,33 +60,36 @@ class Adaptor(SelectorsGeneration):
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
         :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
             If empty, default values will be used.
-        :param debug: Enable debug mode
         """
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")

         self.__text = None
+        self.__raw_body = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

                 body = body.replace(b"\x00", b"").strip()
+                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")

                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+                self.__raw_body = text.strip()

             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=(keep_comments
-                compact=True, huge_tree=huge_tree, default_doctype=True
+                recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
@@ -92,7 +104,6 @@ class Adaptor(SelectorsGeneration):

             self._root = root

-        setup_basic_logging(level='debug' if debug else 'info')
         self.__auto_match_enabled = auto_match

         if self.__auto_match_enabled:
@@ -103,7 +114,7 @@ class Adaptor(SelectorsGeneration):
             }

             if not hasattr(storage, '__wrapped__'):
-                raise ValueError("Storage class must be wrapped with
+                raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

             if not issubclass(storage.__wrapped__, StorageSystemMixin):
                 raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -111,13 +122,13 @@ class Adaptor(SelectorsGeneration):
             self._storage = storage(**storage_args)

         self.__keep_comments = keep_comments
+        self.__keep_cdata = keep_cdata
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
         # For selector stuff
         self.__attributes = None
         self.__tag = None
-        self.__debug = debug
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
             key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -148,8 +159,8 @@ class Adaptor(SelectorsGeneration):
                 root=element,
                 text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                 url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                keep_comments=
-                huge_tree=self.__huge_tree_enabled,
+                keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+                huge_tree=self.__huge_tree_enabled,
                 **self.__response_data
             )
             return element
@@ -236,6 +247,10 @@ class Adaptor(SelectorsGeneration):

         return TextHandler(separator.join([s for s in _all_strings]))

+    def urljoin(self, relative_url: str) -> str:
+        """Join this Adaptor's url with a relative url to form an absolute full URL."""
+        return urljoin(self.url, relative_url)
+
     @property
     def attrib(self) -> AttributesHandler:
         """Get attributes of the element"""
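The new `urljoin` helper simply delegates to `urllib.parse.urljoin` with the page's own `url`, which is handy for resolving relative `href`/`src` values. A short sketch, assuming `page` is an `Adaptor`/`Response` built from `https://books.toscrape.com/` as in the README example further down in this diff:

```python
# The link's href is relative, e.g. 'catalogue/tipping-the-velvet_999/index.html'
link = page.css_first('h3 > a')
print(page.urljoin(link.attrib['href']))
# -> 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
```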
@@ -248,7 +263,10 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

-
+    @property
+    def body(self) -> str:
+        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
+        return self.__raw_body or self.html_content

     def prettify(self) -> str:
         """Return a prettified version of the element's inner html-code"""
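The new `body` property exposes the raw markup that was passed in (captured in `__raw_body` during `__init__`) and falls back to the lxml-serialized `html_content` when the object was built from an element root. A small sketch, with a made-up byte string:

```python
from scrapling.parser import Adaptor

raw = b"<html><body><span>kept   as-is</span></body></html>"
page = Adaptor(body=raw, auto_match=False)

print(page.body)          # the original markup, decoded, without lxml re-serialization
print(page.html_content)  # the same document as re-serialized by lxml
```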
@@ -323,6 +341,16 @@ class Adaptor(SelectorsGeneration):

         return self.__convert_results(prev_element)

+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def __str__(self) -> str:
         return self.html_content

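These no-op aliases let snippets written for Scrapy/parsel (`.get()`, `.extract()`, `.extract_first()`) be pasted onto a single `Adaptor` element without edits: on one element they just return the element itself, while the list variants on `Adaptors` (added further down in this diff) return the first item or the list. A quick sketch, assuming `page` is any parsed `Adaptor`:

```python
element = page.css_first('title')
assert element.get() is element            # aliases on a single element return the element itself
assert element.extract_first() is element
```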
@@ -385,10 +413,10 @@ class Adaptor(SelectorsGeneration):
         if score_table:
             highest_probability = max(score_table.keys())
             if score_table[highest_probability] and highest_probability >= percentage:
-
-
+                log.debug(f'Highest probability was {highest_probability}%')
+                log.debug('Top 5 best matching elements are: ')
                 for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-
+                    log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
                 if not adaptor_type:
                     return score_table[highest_probability]
                 return self.__convert_results(score_table[highest_probability])
@@ -514,7 +542,7 @@ class Adaptor(SelectorsGeneration):

         if selected_elements:
             if not self.__auto_match_enabled and auto_save:
-
+                log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

             elif self.__auto_match_enabled and auto_save:
                 self.save(selected_elements[0], identifier or selector)
@@ -533,7 +561,7 @@ class Adaptor(SelectorsGeneration):
             return self.__convert_results(selected_elements)

         elif not self.__auto_match_enabled and auto_match:
-
+            log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

         return self.__convert_results(selected_elements)

@@ -737,8 +765,8 @@ class Adaptor(SelectorsGeneration):

             self._storage.save(element, identifier)
         else:
-
-            "Can't use Auto-match features
+            log.critical(
+                "Can't use Auto-match features while disabled globally, you have to start a new class instance."
             )

     def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -751,8 +779,8 @@ class Adaptor(SelectorsGeneration):
         if self.__auto_match_enabled:
             return self._storage.retrieve(identifier)

-
-        "Can't use Auto-match features
+        log.critical(
+            "Can't use Auto-match features while disabled globally, you have to start a new class instance."
         )

     # Operations on text functions
@@ -1066,12 +1094,19 @@ class Adaptors(List[Adaptor]):
         ]
         return self.__class__(results) if results else results

+    # For easy copy-paste from Scrapy/parsel code when needed :)
     def get(self, default=None):
         """Returns the first item of the current list
         :param default: the default value to return if the current list is empty
         """
         return self[0] if len(self) > 0 else default

+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+
     @property
     def first(self):
         """Returns the first item of the current list or `None` if the list is empty"""
{scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA
CHANGED
@@ -1,7 +1,7 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.
-Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+Version: 0.2.9
+Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
 Author-email: karim.shoair@pm.me
@@ -29,7 +29,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Typing :: Typed
-Requires-Python: >=3.
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.3
@@ -39,10 +39,9 @@ Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
 Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
-Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.
-Requires-Dist: browserforge
+Requires-Dist: playwright>=1.49.1
+Requires-Dist: rebrowser-playwright>=1.49.1
+Requires-Dist: camoufox[geoip]>=0.4.9

 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -52,7 +51,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

 ```python
->> from scrapling.
+>> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -81,7 +80,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Table of content
 * [Key Features](#key-features)
-* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
 * [Adaptive Scraping](#adaptive-scraping)
 * [Performance](#performance)
 * [Developing Experience](#developing-experience)
@@ -122,7 +121,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Key Features

-### Fetch websites as you prefer
+### Fetch websites as you prefer with async support
 - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
 - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
 - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +212,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.
+Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
@@ -258,47 +257,58 @@ python -m browserforge update
 ```

 ## Fetching Websites
-Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you
+Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

 ### Features
-You might be
+You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.
+from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
 page = StealthyFetcher.fetch('https://example.com')
 ```

-Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

-For all methods, you have `
+For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
->> page = Fetcher().get('https://httpbin.org/get',
+>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
+For Async requests, you will just replace the import like below:
+```python
+>> from scrapling import AsyncFetcher
+>> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+>> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+>> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+>> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+```
 ### StealthyFetcher
-This class is built on top of [Camoufox](https://github.com/daijro/camoufox)
+This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
 >> page.status == 200
 True
+>> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+>> page.status == 200
+True
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

 <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

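The hunk above documents the new `AsyncFetcher` and the `async_fetch` variants. A minimal sketch of driving the async API from a plain script; the target URL is just the httpbin example reused from the README:

```python
import asyncio

from scrapling import AsyncFetcher


async def main():
    # Same keyword arguments as the sync Fetcher methods shown above.
    page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
    print(page.status)


asyncio.run(main())
```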
@@ -314,7 +324,8 @@ True
 | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
-| allow_webgl |
+| allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+| geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
 | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
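The table above adds the `geoip` option to `StealthyFetcher`. A hedged sketch of enabling it on its own; the table recommends pairing it with a proxy, but only the options listed above are confirmed by this diff, so the proxy wiring is left out:

```python
from scrapling import StealthyFetcher

# geoip=True derives timezone, locale, language, and WebRTC/geolocation spoofing from the exit IP.
page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection', geoip=True)
print(page.status)
```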
@@ -333,8 +344,11 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
+>> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+>> page.css_first("#search a::attr(href)")
+'https://github.com/D4Vinci/Scrapling'
 ```
-> Note: all requests done by this fetcher
+> Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

 Using this Fetcher class, you can make requests with:
 1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -346,7 +360,7 @@ Using this Fetcher class, you can make requests with:
 3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
 4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

-> Hence using the `real_chrome` argument requires that you have
+> Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

 Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -369,7 +383,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
 | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
 | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
 | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
-| real_chrome | If you have
+| real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
 | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
 | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
 | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
@@ -437,6 +451,9 @@ You can select elements by their text content in multiple ways, here's a full ex
 >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
 >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +867,6 @@ This project includes code adapted from:

 ## Known Issues
 - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
-- Currently, Scrapling is not compatible with async/await.

 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
scrapling-0.2.9.dist-info/RECORD
ADDED
@@ -0,0 +1,47 @@
+scrapling/__init__.py,sha256=4adit4xM1Io6mBz-VnnSHcPCQxIYhvDmDVMhbXu8VF4,499
+scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
+scrapling/fetchers.py,sha256=I_N32DMjCzNCMmrkGYoX480x1Eh5Lka6cMJ-EcSfszk,35342
+scrapling/parser.py,sha256=NKwOsGR6TB7XC9lMkA418_DRWE6pyUqK0XtmTAA51ic,55215
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
+scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
+scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+scrapling/engines/camo.py,sha256=L5jRNUgJSAY5hE8KCD-tz4SFrx7ZjowJoWpHrl7havI,12359
+scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
+scrapling/engines/pw.py,sha256=0vCDaodve_WcOdbGqBdyRwMECPZmQ0eGLQikh4WHKFc,17011
+scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
+scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
+scrapling/engines/toolbelt/custom.py,sha256=FbWTUC0Z8NTmTLFDiiCchs4W0_Q40lz2ONnhInRNuvA,12947
+scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
+scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
+tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
+tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
+tests/fetchers/async/test_playwright.py,sha256=zzSYnfRksjNep_YipTiYAB9eQaIo3fssKLrsGzXEakw,4068
+tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
+tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
+tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxnmprW0WO6Q,3780
+tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
+tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
+scrapling-0.2.9.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.9.dist-info/METADATA,sha256=Wg6lcRo_5LcyotrB1ZXagT5-gToAyRmtNKsq6TJoNk4,68382
+scrapling-0.2.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.9.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.9.dist-info/RECORD,,
File without changes
tests/fetchers/async/test_camoufox.py
ADDED
@@ -0,0 +1,95 @@
+import pytest
+import pytest_httpbin
+
+from scrapling import StealthyFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+@pytest.mark.asyncio
+class TestStealthyFetcher:
+    @pytest.fixture(scope="class")
+    def fetcher(self):
+        return StealthyFetcher(auto_match=False)
+
+    @pytest.fixture(scope="class")
+    def urls(self, httpbin):
+        url = httpbin.url
+        return {
+            'status_200': f'{url}/status/200',
+            'status_404': f'{url}/status/404',
+            'status_501': f'{url}/status/501',
+            'basic_url': f'{url}/get',
+            'html_url': f'{url}/html',
+            'delayed_url': f'{url}/delay/10',  # 10 Seconds delay response
+            'cookies_url': f"{url}/cookies/set/test/value"
+        }
+
+    async def test_basic_fetch(self, fetcher, urls):
+        """Test doing basic fetch request with multiple statuses"""
+        assert (await fetcher.async_fetch(urls['status_200'])).status == 200
+        assert (await fetcher.async_fetch(urls['status_404'])).status == 404
+        assert (await fetcher.async_fetch(urls['status_501'])).status == 501
+
+    async def test_networkidle(self, fetcher, urls):
+        """Test if waiting for `networkidle` make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
+
+    async def test_blocking_resources(self, fetcher, urls):
+        """Test if blocking resources make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
+        assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
+
+    async def test_waiting_selector(self, fetcher, urls):
+        """Test if waiting for a selector make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            wait_selector='h1',
+            wait_selector_state='visible'
+        )).status == 200
+
+    async def test_cookies_loading(self, fetcher, urls):
+        """Test if cookies are set after the request"""
+        response = await fetcher.async_fetch(urls['cookies_url'])
+        assert response.cookies == {'test': 'value'}
+
+    async def test_automation(self, fetcher, urls):
+        """Test if automation break the code or not"""
+
+        async def scroll_page(page):
+            await page.mouse.wheel(10, 0)
+            await page.mouse.move(100, 400)
+            await page.mouse.up()
+            return page
+
+        assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
+
+    async def test_properties(self, fetcher, urls):
+        """Test if different arguments breaks the code or not"""
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=False,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=False
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            extra_headers={'ayo': ''},
+            os_randomize=True
+        )).status == 200
+
+    async def test_infinite_timeout(self, fetcher, urls):
+        """Test if infinite timeout breaks the code or not"""
+        assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200