scrapling 0.2.8__py3-none-any.whl → 0.2.91__py3-none-any.whl
- scrapling/__init__.py +4 -4
- scrapling/core/_types.py +2 -0
- scrapling/core/custom_types.py +88 -6
- scrapling/core/storage_adaptors.py +5 -6
- scrapling/core/translator.py +2 -2
- scrapling/core/utils.py +29 -27
- scrapling/defaults.py +2 -1
- scrapling/engines/camo.py +124 -24
- scrapling/engines/constants.py +4 -4
- scrapling/engines/pw.py +195 -91
- scrapling/engines/static.py +91 -48
- scrapling/engines/toolbelt/__init__.py +3 -3
- scrapling/engines/toolbelt/custom.py +16 -22
- scrapling/engines/toolbelt/fingerprints.py +3 -3
- scrapling/engines/toolbelt/navigation.py +21 -8
- scrapling/fetchers.py +231 -16
- scrapling/parser.py +50 -22
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
- scrapling-0.2.91.dist-info/RECORD +47 -0
- tests/fetchers/async/__init__.py +0 -0
- tests/fetchers/async/test_camoufox.py +95 -0
- tests/fetchers/async/test_httpx.py +83 -0
- tests/fetchers/async/test_playwright.py +99 -0
- tests/fetchers/sync/__init__.py +0 -0
- tests/fetchers/sync/test_camoufox.py +68 -0
- tests/fetchers/sync/test_httpx.py +82 -0
- tests/fetchers/sync/test_playwright.py +87 -0
- tests/fetchers/test_utils.py +90 -122
- tests/parser/test_automatch.py +64 -9
- tests/parser/test_general.py +260 -218
- scrapling-0.2.8.dist-info/RECORD +0 -42
- tests/fetchers/test_camoufox.py +0 -65
- tests/fetchers/test_httpx.py +0 -68
- tests/fetchers/test_playwright.py +0 -77
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
- {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/parser.py
CHANGED
@@ -2,6 +2,7 @@ import inspect
 import os
 import re
 from difflib import SequenceMatcher
+from urllib.parse import urljoin

 from cssselect import SelectorError, SelectorSyntaxError
 from cssselect import parse as split_selectors
@@ -17,13 +18,14 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
                                              StorageSystemMixin, _StorageTools)
 from scrapling.core.translator import HTMLTranslator
 from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
-                                  is_jsonable,
+                                  is_jsonable, log)


 class Adaptor(SelectorsGeneration):
     __slots__ = (
-        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
+        'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
         '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+        '__keep_cdata', '__raw_body'
     )

     def __init__(
@@ -35,10 +37,10 @@ class Adaptor(SelectorsGeneration):
             huge_tree: bool = True,
             root: Optional[html.HtmlElement] = None,
             keep_comments: Optional[bool] = False,
+            keep_cdata: Optional[bool] = False,
             auto_match: Optional[bool] = True,
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
-            debug: Optional[bool] = True,
             **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
@@ -58,33 +60,36 @@ class Adaptor(SelectorsGeneration):
         :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
             Don't use it unless you know what you are doing!
         :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+        :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
         :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
             priority over all auto-match related arguments/functions in the class.
         :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
         :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
             If empty, default values will be used.
-        :param debug: Enable debug mode
         """
         if root is None and not body and text is None:
             raise ValueError("Adaptor class needs text, body, or root arguments to work")

         self.__text = None
+        self.__raw_body = ''
         if root is None:
             if text is None:
                 if not body or not isinstance(body, bytes):
                     raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

                 body = body.replace(b"\x00", b"").strip()
+                self.__raw_body = body.replace(b"\x00", b"").strip().decode()
             else:
                 if not isinstance(text, str):
                     raise TypeError(f"text argument must be of type str, got {text.__class__}")

                 body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+                self.__raw_body = text.strip()

             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
             parser = html.HTMLParser(
-                recover=True, remove_blank_text=True, remove_comments=(keep_comments
-                compact=True, huge_tree=huge_tree, default_doctype=True
+                recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
             )
             self._root = etree.fromstring(body, parser=parser, base_url=url)
             if is_jsonable(text or body.decode()):
@@ -99,7 +104,6 @@ class Adaptor(SelectorsGeneration):

             self._root = root

-        setup_basic_logging(level='debug' if debug else 'info')
         self.__auto_match_enabled = auto_match

         if self.__auto_match_enabled:
@@ -110,7 +114,7 @@ class Adaptor(SelectorsGeneration):
                 }

             if not hasattr(storage, '__wrapped__'):
-                raise ValueError("Storage class must be wrapped with
+                raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

             if not issubclass(storage.__wrapped__, StorageSystemMixin):
                 raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -118,13 +122,13 @@ class Adaptor(SelectorsGeneration):
             self._storage = storage(**storage_args)

         self.__keep_comments = keep_comments
+        self.__keep_cdata = keep_cdata
         self.__huge_tree_enabled = huge_tree
         self.encoding = encoding
         self.url = url
         # For selector stuff
         self.__attributes = None
         self.__tag = None
-        self.__debug = debug
         # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
         self.__response_data = {
             key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -151,12 +155,12 @@ class Adaptor(SelectorsGeneration):
         else:
             if issubclass(type(element), html.HtmlMixin):

-                return
+                return Adaptor(
                     root=element,
                     text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                    keep_comments=
-                    huge_tree=self.__huge_tree_enabled,
+                    keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+                    huge_tree=self.__huge_tree_enabled,
                     **self.__response_data
                 )
             return element
@@ -243,6 +247,10 @@ class Adaptor(SelectorsGeneration):

         return TextHandler(separator.join([s for s in _all_strings]))

+    def urljoin(self, relative_url: str) -> str:
+        """Join this Adaptor's url with a relative url to form an absolute full URL."""
+        return urljoin(self.url, relative_url)
+
     @property
     def attrib(self) -> AttributesHandler:
         """Get attributes of the element"""
@@ -255,7 +263,10 @@ class Adaptor(SelectorsGeneration):
         """Return the inner html code of the element"""
         return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

-
+    @property
+    def body(self) -> str:
+        """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
+        return self.__raw_body or self.html_content

     def prettify(self) -> str:
         """Return a prettified version of the element's inner html-code"""
@@ -330,6 +341,16 @@ class Adaptor(SelectorsGeneration):

         return self.__convert_results(prev_element)

+    # For easy copy-paste from Scrapy/parsel code when needed :)
+    def get(self, default=None):
+        return self
+
+    def get_all(self):
+        return self
+
+    extract = get_all
+    extract_first = get
+
     def __str__(self) -> str:
         return self.html_content

@@ -392,10 +413,10 @@ class Adaptor(SelectorsGeneration):
         if score_table:
             highest_probability = max(score_table.keys())
             if score_table[highest_probability] and highest_probability >= percentage:
-
-
+                log.debug(f'Highest probability was {highest_probability}%')
+                log.debug('Top 5 best matching elements are: ')
                 for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-
+                    log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
                 if not adaptor_type:
                     return score_table[highest_probability]
                 return self.__convert_results(score_table[highest_probability])
@@ -521,7 +542,7 @@ class Adaptor(SelectorsGeneration):

         if selected_elements:
             if not self.__auto_match_enabled and auto_save:
-
+                log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

             elif self.__auto_match_enabled and auto_save:
                 self.save(selected_elements[0], identifier or selector)
@@ -540,7 +561,7 @@ class Adaptor(SelectorsGeneration):
             return self.__convert_results(selected_elements)

         elif not self.__auto_match_enabled and auto_match:
-
+            log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

         return self.__convert_results(selected_elements)

@@ -744,8 +765,8 @@ class Adaptor(SelectorsGeneration):

             self._storage.save(element, identifier)
         else:
-
-            "Can't use Auto-match features
+            log.critical(
+                "Can't use Auto-match features while disabled globally, you have to start a new class instance."
             )

     def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -758,8 +779,8 @@ class Adaptor(SelectorsGeneration):
         if self.__auto_match_enabled:
             return self._storage.retrieve(identifier)

-
-        "Can't use Auto-match features
+        log.critical(
+            "Can't use Auto-match features while disabled globally, you have to start a new class instance."
        )

     # Operations on text functions
@@ -1073,12 +1094,19 @@ class Adaptors(List[Adaptor]):
         ]
         return self.__class__(results) if results else results

+    # For easy copy-paste from Scrapy/parsel code when needed :)
     def get(self, default=None):
         """Returns the first item of the current list
         :param default: the default value to return if the current list is empty
         """
         return self[0] if len(self) > 0 else default

+    def extract(self):
+        return self
+
+    extract_first = get
+    get_all = extract
+
     @property
     def first(self):
         """Returns the first item of the current list or `None` if the list is empty"""
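The diff above adds a handful of parser conveniences: an `Adaptor.urljoin()` helper, a raw `body` property, parsel-style `get`/`get_all`/`extract`/`extract_first` aliases, and a `keep_cdata` init argument. A minimal usage sketch of these additions follows; the HTML snippet and URL are made-up examples, and the top-level `Adaptor` import is assumed rather than shown in this diff:

```python
from scrapling import Adaptor  # assumed top-level export

html = '<html><body><a href="catalogue/item_1/index.html">Item</a></body></html>'
page = Adaptor(text=html, url='https://books.toscrape.com/', keep_cdata=False)

link = page.css('a').get()                # parsel-style alias: first match (or None)
print(page.urljoin(link.attrib['href']))  # -> https://books.toscrape.com/catalogue/item_1/index.html
print(page.body)                          # raw input HTML when available, else falls back to html_content
print(page.css('a').extract())            # parsel-style alias: the whole result list
```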
{scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.8
+Version: 0.2.91
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
 Classifier: Topic :: Software Development :: Libraries :: Python Modules
 Classifier: Programming Language :: Python :: 3
 Classifier: Programming Language :: Python :: 3 :: Only
-Classifier: Programming Language :: Python :: 3.8
 Classifier: Programming Language :: Python :: 3.9
 Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
@@ -29,7 +28,7 @@ Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
 Classifier: Programming Language :: Python :: Implementation :: CPython
 Classifier: Typing :: Typed
-Requires-Python: >=3.8
+Requires-Python: >=3.9
 Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: requests>=2.3
@@ -38,11 +37,10 @@ Requires-Dist: cssselect>=1.2
 Requires-Dist: w3lib
 Requires-Dist: orjson>=3
 Requires-Dist: tldextract
-Requires-Dist: httpx[brotli,zstd]
-Requires-Dist: playwright
-Requires-Dist: rebrowser-playwright
-Requires-Dist: camoufox>=0.4.
-Requires-Dist: browserforge
+Requires-Dist: httpx[brotli,socks,zstd]
+Requires-Dist: playwright>=1.49.1
+Requires-Dist: rebrowser-playwright>=1.49.1
+Requires-Dist: camoufox[geoip]>=0.4.9

 # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
 [](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [](https://badge.fury.io/py/Scrapling) [](https://pypi.org/project/scrapling/) [](https://pepy.tech/project/scrapling)
@@ -52,7 +50,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

 ```python
->> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
 >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
@@ -81,7 +79,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Table of content
 * [Key Features](#key-features)
-* [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+* [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
 * [Adaptive Scraping](#adaptive-scraping)
 * [Performance](#performance)
 * [Developing Experience](#developing-experience)
@@ -122,7 +120,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

 ## Key Features

-### Fetch websites as you prefer
+### Fetch websites as you prefer with async support
 - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
 - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
 - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +211,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
 > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

 ## Installation
-Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.
+Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
 ```bash
 pip3 install scrapling
 ```
@@ -265,11 +263,11 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
-All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `
+All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

 If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
 ```python
-from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
 ```
 then use it right away without initializing like:
 ```python
@@ -282,21 +280,32 @@ Also, the `Response` object returned from all fetchers is the same as the `Adapt
 ### Fetcher
 This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

-For all methods, you have `
+For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

 You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
 ```python
->> page = Fetcher().get('https://httpbin.org/get',
+>> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
 >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
 >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
 >> page = Fetcher().delete('https://httpbin.org/delete')
 ```
+For Async requests, you will just replace the import like below:
+```python
+>> from scrapling import AsyncFetcher
+>> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+>> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+>> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+>> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+```
 ### StealthyFetcher
 This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
 ```python
 >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
 >> page.status == 200
 True
+>> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+>> page.status == 200
+True
 ```
 > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -314,7 +323,8 @@ True
 | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
 | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
 | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
-| allow_webgl |
+| allow_webgl | Enabled by default. Disabling it WebGL not recommended as many WAFs now checks if WebGL is enabled. | ✔️ |
+| geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
 | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
 | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
 | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -333,6 +343,9 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
 >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
 >> page.css_first("#search a::attr(href)")
 'https://github.com/D4Vinci/Scrapling'
+>> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+>> page.css_first("#search a::attr(href)")
+'https://github.com/D4Vinci/Scrapling'
 ```
 > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -437,6 +450,9 @@ You can select elements by their text content in multiple ways, here's a full ex
 >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
 <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+>>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
 >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
 [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +866,6 @@ This project includes code adapted from:

 ## Known Issues
 - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
-- Currently, Scrapling is not compatible with async/await.

 ---
 <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
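Taken together with the README changes above, the new async surface can be driven from one script. A short sketch, assuming the `AsyncFetcher` and `async_fetch` calls exactly as documented in the diff (the URLs are the same placeholders the README uses):

```python
import asyncio

from scrapling import AsyncFetcher, StealthyFetcher


async def main():
    # Plain async HTTP request (async counterpart of Fetcher.get)
    page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True)
    print(page.status)

    # Async version of the stealthy Camoufox-based fetch
    page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection')
    print(page.status)


asyncio.run(main())
```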
scrapling-0.2.91.dist-info/RECORD
ADDED
@@ -0,0 +1,47 @@
+scrapling/__init__.py,sha256=pfbhEm1kcriA9pFR3JUUFEE3v4_ykB35SYbeHKzFxHw,500
+scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
+scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
+scrapling/parser.py,sha256=Fl9cdbR58GuoPbWN5hZI6ToPSl0_rQFXMskTdzpoxWs,55208
+scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
+scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
+scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
+scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+scrapling/engines/camo.py,sha256=g12IVIPy4Uyp_jngtu8Qcvy7PSMHjURAHUGXdM58Kks,13778
+scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
+scrapling/engines/pw.py,sha256=Eq4_oQA5eX666chiNpXsBqhWONzleniyXjKdmCpXj_Y,18630
+scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
+scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
+scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
+scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
+scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
+tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
+tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
+tests/fetchers/async/test_playwright.py,sha256=zzSYnfRksjNep_YipTiYAB9eQaIo3fssKLrsGzXEakw,4068
+tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
+tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
+tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxnmprW0WO6Q,3780
+tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
+tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
+scrapling-0.2.91.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.91.dist-info/METADATA,sha256=ajc8n5Hjl--ZdGXwHxmfMEWyCMgbw1waZNovoPFxrUc,68339
+scrapling-0.2.91.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+scrapling-0.2.91.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.91.dist-info/RECORD,,
tests/fetchers/async/test_camoufox.py
ADDED
@@ -0,0 +1,95 @@
+import pytest
+import pytest_httpbin
+
+from scrapling import StealthyFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+@pytest.mark.asyncio
+class TestStealthyFetcher:
+    @pytest.fixture(scope="class")
+    def fetcher(self):
+        return StealthyFetcher(auto_match=False)
+
+    @pytest.fixture(scope="class")
+    def urls(self, httpbin):
+        url = httpbin.url
+        return {
+            'status_200': f'{url}/status/200',
+            'status_404': f'{url}/status/404',
+            'status_501': f'{url}/status/501',
+            'basic_url': f'{url}/get',
+            'html_url': f'{url}/html',
+            'delayed_url': f'{url}/delay/10',  # 10 Seconds delay response
+            'cookies_url': f"{url}/cookies/set/test/value"
+        }
+
+    async def test_basic_fetch(self, fetcher, urls):
+        """Test doing basic fetch request with multiple statuses"""
+        assert (await fetcher.async_fetch(urls['status_200'])).status == 200
+        assert (await fetcher.async_fetch(urls['status_404'])).status == 404
+        assert (await fetcher.async_fetch(urls['status_501'])).status == 501
+
+    async def test_networkidle(self, fetcher, urls):
+        """Test if waiting for `networkidle` make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
+
+    async def test_blocking_resources(self, fetcher, urls):
+        """Test if blocking resources make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
+        assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
+
+    async def test_waiting_selector(self, fetcher, urls):
+        """Test if waiting for a selector make page does not finish loading or not"""
+        assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            wait_selector='h1',
+            wait_selector_state='visible'
+        )).status == 200
+
+    async def test_cookies_loading(self, fetcher, urls):
+        """Test if cookies are set after the request"""
+        response = await fetcher.async_fetch(urls['cookies_url'])
+        assert response.cookies == {'test': 'value'}
+
+    async def test_automation(self, fetcher, urls):
+        """Test if automation break the code or not"""
+
+        async def scroll_page(page):
+            await page.mouse.wheel(10, 0)
+            await page.mouse.move(100, 400)
+            await page.mouse.up()
+            return page
+
+        assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
+
+    async def test_properties(self, fetcher, urls):
+        """Test if different arguments breaks the code or not"""
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=False,
+            allow_webgl=True
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            block_webrtc=True,
+            allow_webgl=False
+        )).status == 200
+
+        assert (await fetcher.async_fetch(
+            urls['html_url'],
+            extra_headers={'ayo': ''},
+            os_randomize=True
+        )).status == 200
+
+    async def test_infinite_timeout(self, fetcher, urls):
+        """Test if infinite timeout breaks the code or not"""
+        assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200
tests/fetchers/async/test_httpx.py
ADDED
@@ -0,0 +1,83 @@
+import pytest
+import pytest_httpbin
+
+from scrapling.fetchers import AsyncFetcher
+
+
+@pytest_httpbin.use_class_based_httpbin
+@pytest.mark.asyncio
+class TestAsyncFetcher:
+    @pytest.fixture(scope="class")
+    def fetcher(self):
+        return AsyncFetcher(auto_match=True)
+
+    @pytest.fixture(scope="class")
+    def urls(self, httpbin):
+        return {
+            'status_200': f'{httpbin.url}/status/200',
+            'status_404': f'{httpbin.url}/status/404',
+            'status_501': f'{httpbin.url}/status/501',
+            'basic_url': f'{httpbin.url}/get',
+            'post_url': f'{httpbin.url}/post',
+            'put_url': f'{httpbin.url}/put',
+            'delete_url': f'{httpbin.url}/delete',
+            'html_url': f'{httpbin.url}/html'
+        }
+
+    async def test_basic_get(self, fetcher, urls):
+        """Test doing basic get request with multiple statuses"""
+        assert (await fetcher.get(urls['status_200'])).status == 200
+        assert (await fetcher.get(urls['status_404'])).status == 404
+        assert (await fetcher.get(urls['status_501'])).status == 501
+
+    async def test_get_properties(self, fetcher, urls):
+        """Test if different arguments with GET request breaks the code or not"""
+        assert (await fetcher.get(urls['status_200'], stealthy_headers=True)).status == 200
+        assert (await fetcher.get(urls['status_200'], follow_redirects=True)).status == 200
+        assert (await fetcher.get(urls['status_200'], timeout=None)).status == 200
+        assert (await fetcher.get(
+            urls['status_200'],
+            stealthy_headers=True,
+            follow_redirects=True,
+            timeout=None
+        )).status == 200
+
+    async def test_post_properties(self, fetcher, urls):
+        """Test if different arguments with POST request breaks the code or not"""
+        assert (await fetcher.post(urls['post_url'], data={'key': 'value'})).status == 200
+        assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, stealthy_headers=True)).status == 200
+        assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, follow_redirects=True)).status == 200
+        assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, timeout=None)).status == 200
+        assert (await fetcher.post(
+            urls['post_url'],
+            data={'key': 'value'},
+            stealthy_headers=True,
+            follow_redirects=True,
+            timeout=None
+        )).status == 200
+
+    async def test_put_properties(self, fetcher, urls):
+        """Test if different arguments with PUT request breaks the code or not"""
+        assert (await fetcher.put(urls['put_url'], data={'key': 'value'})).status in [200, 405]
+        assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, stealthy_headers=True)).status in [200, 405]
+        assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, follow_redirects=True)).status in [200, 405]
+        assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, timeout=None)).status in [200, 405]
+        assert (await fetcher.put(
+            urls['put_url'],
+            data={'key': 'value'},
+            stealthy_headers=True,
+            follow_redirects=True,
+            timeout=None
+        )).status in [200, 405]
+
+    async def test_delete_properties(self, fetcher, urls):
+        """Test if different arguments with DELETE request breaks the code or not"""
+        assert (await fetcher.delete(urls['delete_url'], stealthy_headers=True)).status == 200
+        assert (await fetcher.delete(urls['delete_url'], follow_redirects=True)).status == 200
+        assert (await fetcher.delete(urls['delete_url'], timeout=None)).status == 200
+        assert (await fetcher.delete(
+            urls['delete_url'],
+            stealthy_headers=True,
+            follow_redirects=True,
+            timeout=None
+        )).status == 200