scrapling 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- scrapling/__init__.py +1 -1
- scrapling/defaults.py +6 -0
- scrapling/engines/camo.py +2 -2
- scrapling/engines/pw.py +2 -2
- scrapling/engines/static.py +2 -2
- scrapling/engines/toolbelt/custom.py +3 -4
- scrapling/parser.py +11 -2
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/METADATA +15 -4
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/RECORD +12 -11
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/LICENSE +0 -0
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/WHEEL +0 -0
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
|
|
4
4
|
from scrapling.core.custom_types import TextHandler, AttributesHandler
|
5
5
|
|
6
6
|
__author__ = "Karim Shoair (karim.shoair@pm.me)"
|
7
|
-
__version__ = "0.2.1"
|
7
|
+
__version__ = "0.2.2"
|
8
8
|
__copyright__ = "Copyright (c) 2024 Karim Shoair"
|
9
9
|
|
10
10
|
|
scrapling/defaults.py
ADDED
@@ -0,0 +1,6 @@
|
|
1
|
+
from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
|
2
|
+
|
3
|
+
# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
|
4
|
+
Fetcher = Fetcher()
|
5
|
+
StealthyFetcher = StealthyFetcher()
|
6
|
+
PlayWrightFetcher = PlayWrightFetcher()
|
scrapling/engines/camo.py
CHANGED
@@ -114,14 +114,14 @@ class CamoufoxEngine:
|
|
114
114
|
response = Response(
|
115
115
|
url=res.url,
|
116
116
|
text=page.content(),
|
117
|
-
|
117
|
+
body=res.body(),
|
118
118
|
status=res.status,
|
119
119
|
reason=res.status_text,
|
120
120
|
encoding=encoding,
|
121
121
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
122
122
|
headers=res.all_headers(),
|
123
123
|
request_headers=res.request.all_headers(),
|
124
|
-
|
124
|
+
**self.adaptor_arguments
|
125
125
|
)
|
126
126
|
page.close()
|
127
127
|
|
scrapling/engines/pw.py
CHANGED
@@ -224,14 +224,14 @@ class PlaywrightEngine:
|
|
224
224
|
response = Response(
|
225
225
|
url=res.url,
|
226
226
|
text=page.content(),
|
227
|
-
|
227
|
+
body=res.body(),
|
228
228
|
status=res.status,
|
229
229
|
reason=res.status_text,
|
230
230
|
encoding=encoding,
|
231
231
|
cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
|
232
232
|
headers=res.all_headers(),
|
233
233
|
request_headers=res.request.all_headers(),
|
234
|
-
|
234
|
+
**self.adaptor_arguments
|
235
235
|
)
|
236
236
|
page.close()
|
237
237
|
return response
|
scrapling/engines/static.py
CHANGED
@@ -53,14 +53,14 @@ class StaticEngine:
|
|
53
53
|
return Response(
|
54
54
|
url=str(response.url),
|
55
55
|
text=response.text,
|
56
|
-
|
56
|
+
body=response.content,
|
57
57
|
status=response.status_code,
|
58
58
|
reason=response.reason_phrase,
|
59
59
|
encoding=response.encoding or 'utf-8',
|
60
60
|
cookies=dict(response.cookies),
|
61
61
|
headers=dict(response.headers),
|
62
62
|
request_headers=dict(response.request.headers),
|
63
|
-
|
63
|
+
**self.adaptor_arguments
|
64
64
|
)
|
65
65
|
|
66
66
|
def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
|
@@ -12,15 +12,14 @@ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callab
|
|
12
12
|
class Response(Adaptor):
|
13
13
|
"""This class is returned by all engines as a way to unify response type between different libraries."""
|
14
14
|
|
15
|
-
def __init__(self, url: str, text: str,
|
15
|
+
def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
|
16
16
|
automatch_domain = adaptor_arguments.pop('automatch_domain', None)
|
17
|
-
super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
|
18
|
-
|
19
17
|
self.status = status
|
20
18
|
self.reason = reason
|
21
19
|
self.cookies = cookies
|
22
20
|
self.headers = headers
|
23
21
|
self.request_headers = request_headers
|
22
|
+
super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
|
24
23
|
# For back-ward compatibility
|
25
24
|
self.adaptor = self
|
26
25
|
|
@@ -31,7 +30,7 @@ class Response(Adaptor):
|
|
31
30
|
class BaseFetcher:
|
32
31
|
def __init__(
|
33
32
|
self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
|
34
|
-
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] =
|
33
|
+
storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
|
35
34
|
automatch_domain: Optional[str] = None,
|
36
35
|
):
|
37
36
|
"""Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
|
scrapling/parser.py
CHANGED
@@ -32,6 +32,7 @@ class Adaptor(SelectorsGeneration):
|
|
32
32
|
storage: Any = SQLiteStorageSystem,
|
33
33
|
storage_args: Optional[Dict] = None,
|
34
34
|
debug: Optional[bool] = True,
|
35
|
+
**kwargs
|
35
36
|
):
|
36
37
|
"""The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
|
37
38
|
with expressions in CSS, XPath, or with simply text. Check the docs for more info.
|
@@ -117,6 +118,10 @@ class Adaptor(SelectorsGeneration):
|
|
117
118
|
self.__attributes = None
|
118
119
|
self.__tag = None
|
119
120
|
self.__debug = debug
|
121
|
+
# No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
|
122
|
+
self.__response_data = {
|
123
|
+
key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
|
124
|
+
} if hasattr(self, 'status') else {}
|
120
125
|
|
121
126
|
# Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
|
122
127
|
@staticmethod
|
@@ -138,10 +143,14 @@ class Adaptor(SelectorsGeneration):
|
|
138
143
|
return TextHandler(str(element))
|
139
144
|
else:
|
140
145
|
if issubclass(type(element), html.HtmlMixin):
|
146
|
+
|
141
147
|
return self.__class__(
|
142
|
-
root=element,
|
148
|
+
root=element,
|
149
|
+
text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
|
150
|
+
url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
|
143
151
|
keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
|
144
|
-
huge_tree=self.__huge_tree_enabled, debug=self.__debug
|
152
|
+
huge_tree=self.__huge_tree_enabled, debug=self.__debug,
|
153
|
+
**self.__response_data
|
145
154
|
)
|
146
155
|
return element
|
147
156
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: scrapling
|
3
|
-
Version: 0.2.1
|
3
|
+
Version: 0.2.2
|
4
4
|
Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
|
5
5
|
Home-page: https://github.com/D4Vinci/Scrapling
|
6
6
|
Author: Karim Shoair
|
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
|
|
52
52
|
Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
|
53
53
|
|
54
54
|
```python
|
55
|
-
>> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
55
|
+
>> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
|
56
56
|
# Fetch websites' source under the radar!
|
57
|
-
>> page = StealthyFetcher
|
57
|
+
>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
|
58
58
|
>> print(page.status)
|
59
59
|
200
|
60
60
|
>> products = page.css('.product', auto_save=True) # Scrape data that survives website design changes!
|
@@ -257,12 +257,21 @@ python -m browserforge update
|
|
257
257
|
```
|
258
258
|
|
259
259
|
## Fetching Websites Features
|
260
|
-
All fetcher-type classes are imported in the same way
|
260
|
+
You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
|
261
261
|
```python
|
262
262
|
from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
|
263
263
|
```
|
264
264
|
And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
|
265
265
|
|
266
|
+
If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
|
267
|
+
```python
|
268
|
+
from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
|
269
|
+
```
|
270
|
+
then use it right away without initializing like:
|
271
|
+
```python
|
272
|
+
page = StealthyFetcher.fetch('https://example.com')
|
273
|
+
```
|
274
|
+
|
266
275
|
Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
|
267
276
|
> [!NOTE]
|
268
277
|
> The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
|
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its st
|
|
803
812
|
|
804
813
|
## More Sponsors!
|
805
814
|
[](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
|
815
|
+
<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
|
816
|
+
|
806
817
|
|
807
818
|
## Contributing
|
808
819
|
Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
|
@@ -1,6 +1,7 @@
|
|
1
|
-
scrapling/__init__.py,sha256=
|
1
|
+
scrapling/__init__.py,sha256=lpRuPRo5y_KrUeY78qgX5H_C2dWFV33VqrTX0OafHO8,435
|
2
|
+
scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
|
2
3
|
scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
|
3
|
-
scrapling/parser.py,sha256=
|
4
|
+
scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
|
4
5
|
scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
|
5
6
|
scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
6
7
|
scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
|
@@ -10,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
|
|
10
11
|
scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
|
11
12
|
scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
|
12
13
|
scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
|
13
|
-
scrapling/engines/camo.py,sha256=
|
14
|
+
scrapling/engines/camo.py,sha256=41vp2Nh51kKuOSZ1PijsIpROpQZgFfUPybVbEX8pEXk,7530
|
14
15
|
scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
|
15
|
-
scrapling/engines/pw.py,sha256=
|
16
|
-
scrapling/engines/static.py,sha256=
|
16
|
+
scrapling/engines/pw.py,sha256=l5MrSW_WNBKAxAlyxbt09ka_lEGo61XKuaOgWpYmvHk,12102
|
17
|
+
scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
|
17
18
|
scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
|
18
|
-
scrapling/engines/toolbelt/custom.py,sha256=
|
19
|
+
scrapling/engines/toolbelt/custom.py,sha256=8lvGHWIZoOotSTF97KgPb3CbJquel2QFx8rP8Hf2sQ4,7469
|
19
20
|
scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
|
20
21
|
scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
|
21
22
|
tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
|
@@ -26,8 +27,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
|
|
26
27
|
tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
27
28
|
tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
|
28
29
|
tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
|
29
|
-
scrapling-0.2.
|
30
|
-
scrapling-0.2.
|
31
|
-
scrapling-0.2.
|
32
|
-
scrapling-0.2.
|
33
|
-
scrapling-0.2.
|
30
|
+
scrapling-0.2.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
|
31
|
+
scrapling-0.2.2.dist-info/METADATA,sha256=gk7fij0BkRwA51dJlCbARlx_FW9_U9v9ptk3Mc5-YKQ,64784
|
32
|
+
scrapling-0.2.2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
|
33
|
+
scrapling-0.2.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
|
34
|
+
scrapling-0.2.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|