scrapling 0.2.1__py3-none-any.whl → 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- scrapling/__init__.py +1 -1
- scrapling/defaults.py +6 -0
- scrapling/engines/camo.py +2 -2
- scrapling/engines/pw.py +2 -2
- scrapling/engines/static.py +2 -2
- scrapling/engines/toolbelt/custom.py +3 -4
- scrapling/parser.py +11 -2
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/METADATA +15 -4
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/RECORD +12 -11
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/LICENSE +0 -0
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/WHEEL +0 -0
- {scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/top_level.txt +0 -0
scrapling/__init__.py
CHANGED
@@ -4,7 +4,7 @@ from scrapling.parser import Adaptor, Adaptors
 from scrapling.core.custom_types import TextHandler, AttributesHandler
 
 __author__ = "Karim Shoair (karim.shoair@pm.me)"
-__version__ = "0.2.1"
+__version__ = "0.2.2"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"
 
 
scrapling/defaults.py
ADDED
@@ -0,0 +1,6 @@
+from .fetchers import Fetcher, StealthyFetcher, PlayWrightFetcher
+
+# If you are going to use Fetchers with the default settings, import them from this file instead for a cleaner looking code
+Fetcher = Fetcher()
+StealthyFetcher = StealthyFetcher()
+PlayWrightFetcher = PlayWrightFetcher()
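Since the file added above is scrapling/defaults.py, the import path that matches it is `scrapling.defaults` (note that the README snippets later in this diff write `scrapling.default`). A minimal usage sketch of the prebuilt instances, reusing the exact `fetch` call from this release's README:

```python
# Usage sketch: these names are already-initialized instances exported at
# module level, so no constructor call is needed before fetching.
from scrapling.defaults import StealthyFetcher

page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
print(page.status)  # Response attributes are available right away
```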
scrapling/engines/camo.py
CHANGED
@@ -114,14 +114,14 @@ class CamoufoxEngine:
             response = Response(
                 url=res.url,
                 text=page.content(),
-
+                body=res.body(),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
                 request_headers=res.request.all_headers(),
-
+                **self.adaptor_arguments
             )
             page.close()
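The swapped line feeds the raw response bytes into `Response` via `res.body()`. Camoufox exposes a Playwright-compatible page/response API, and the two sources differ in kind; a standalone illustration using plain Playwright (not Scrapling code):

```python
# Plain Playwright (sync API) illustration: res.body() is the raw HTTP
# response payload as bytes, while page.content() serializes the current
# DOM (after any JavaScript ran) as a string.
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch()
    page = browser.new_page()
    res = page.goto('https://example.com')
    raw = res.body()           # bytes, as received over the wire
    rendered = page.content()  # str, snapshot of the live DOM
    browser.close()
```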
scrapling/engines/pw.py
CHANGED
@@ -224,14 +224,14 @@ class PlaywrightEngine:
             response = Response(
                 url=res.url,
                 text=page.content(),
-
+                body=res.body(),
                 status=res.status,
                 reason=res.status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
                 headers=res.all_headers(),
                 request_headers=res.request.all_headers(),
-
+                **self.adaptor_arguments
             )
             page.close()
             return response
scrapling/engines/static.py
CHANGED
@@ -53,14 +53,14 @@ class StaticEngine:
         return Response(
             url=str(response.url),
             text=response.text,
-
+            body=response.content,
             status=response.status_code,
             reason=response.reason_phrase,
             encoding=response.encoding or 'utf-8',
             cookies=dict(response.cookies),
             headers=dict(response.headers),
             request_headers=dict(response.request.headers),
-
+            **self.adaptor_arguments
         )
 
     def get(self, url: str, stealthy_headers: Optional[bool] = True, **kwargs: Dict) -> Response:
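The attribute names used here (`status_code`, `reason_phrase`, raw `content` next to decoded `text`) match httpx's response object. A short sketch of those source fields, under the assumption that httpx is indeed the underlying HTTP client:

```python
# Assumed: an httpx response, as the attribute names in the diff suggest.
import httpx

response = httpx.get('https://example.com')
print(type(response.content))        # bytes -> forwarded as Response.body
print(type(response.text))           # str   -> forwarded as Response.text
print(response.status_code, response.reason_phrase)
print(response.encoding or 'utf-8')  # encoding can be None, hence the fallback
```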
scrapling/engines/toolbelt/custom.py
CHANGED
@@ -12,15 +12,14 @@ from scrapling.core._types import Any, List, Type, Union, Optional, Dict, Callable
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    def __init__(self, url: str, text: str,
+    def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict, encoding: str = 'utf-8', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
-        super().__init__(text=text, body=content, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
-
         self.status = status
         self.reason = reason
         self.cookies = cookies
         self.headers = headers
         self.request_headers = request_headers
+        super().__init__(text=text, body=body, url=automatch_domain or url, encoding=encoding, **adaptor_arguments)
         # For back-ward compatibility
         self.adaptor = self
 
@@ -31,7 +30,7 @@ class Response(Adaptor):
 class BaseFetcher:
     def __init__(
             self, huge_tree: bool = True, keep_comments: Optional[bool] = False, auto_match: Optional[bool] = True,
-            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] =
+            storage: Any = SQLiteStorageSystem, storage_args: Optional[Dict] = None, debug: Optional[bool] = False,
             automatch_domain: Optional[str] = None,
     ):
         """Arguments below are the same from the Adaptor class so you can pass them directly, the rest of Adaptor's arguments
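Two related changes land here: the constructor now spells out every response field, and the `super().__init__` call passes the new `body` argument instead of the old `content` name, with the attribute assignments moved in front of it so `Adaptor`'s initializer can observe them. A hedged sketch of constructing a `Response` by hand, assuming the module path shown in this diff and default storage settings:

```python
# Sketch only: building a Response directly with the new 0.2.2 signature.
from scrapling.engines.toolbelt.custom import Response

html = '<html><body><h1>hi</h1></body></html>'
resp = Response(
    url='https://example.com',
    text=html,
    body=html.encode('utf-8'),  # raw bytes now travel separately from text
    status=200,
    reason='OK',
    cookies={},
    headers={},
    request_headers={},
)
# status/reason/etc. are set before Adaptor.__init__ runs, which is what
# lets the parser snapshot them (see the parser.py hunks below).
print(resp.status, resp.reason)
```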
scrapling/parser.py
CHANGED
@@ -32,6 +32,7 @@ class Adaptor(SelectorsGeneration):
             storage: Any = SQLiteStorageSystem,
             storage_args: Optional[Dict] = None,
             debug: Optional[bool] = True,
+            **kwargs
     ):
         """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
         with expressions in CSS, XPath, or with simply text. Check the docs for more info.
@@ -117,6 +118,10 @@ class Adaptor(SelectorsGeneration):
         self.__attributes = None
         self.__tag = None
         self.__debug = debug
+        # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
+        self.__response_data = {
+            key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
+        } if hasattr(self, 'status') else {}
 
     # Node functionalities, I wanted to move to separate Mixin class but it had slight impact on performance
     @staticmethod
@@ -138,10 +143,14 @@ class Adaptor(SelectorsGeneration):
             return TextHandler(str(element))
         else:
             if issubclass(type(element), html.HtmlMixin):
+
                 return self.__class__(
-                    root=element,
+                    root=element,
+                    text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+                    url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
                     keep_comments=True,  # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                    huge_tree=self.__huge_tree_enabled, debug=self.__debug
+                    huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                    **self.__response_data
                 )
             return element
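Taken together, the parser changes make response metadata travel with child elements: `__response_data` is captured once at construction time and re-applied whenever `self.__class__` builds a sub-element, so children report the same `status`, `headers`, and cookies as their page. A behavior sketch, assuming the classic `Fetcher().get` API from the README:

```python
# Behavior sketch: child elements created by selection now inherit the
# page's response attributes via **self.__response_data (new in 0.2.2).
from scrapling import Fetcher

page = Fetcher().get('https://example.com')
heading = page.css_first('h1')            # child built through self.__class__(...)
print(page.status)                        # e.g. 200
print(getattr(heading, 'status', None))   # now matches the page's status
```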
{scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: scrapling
-Version: 0.2.1
+Version: 0.2.2
 Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
 Home-page: https://github.com/D4Vinci/Scrapling
 Author: Karim Shoair
@@ -52,9 +52,9 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
 Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.
 
 ```python
->> from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
+>> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
 # Fetch websites' source under the radar!
->> page = StealthyFetcher
+>> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
 >> print(page.status)
 200
 >> products = page.css('.product', auto_save=True)  # Scrape data that survives website design changes!
@@ -257,12 +257,21 @@ python -m browserforge update
 ```
 
 ## Fetching Websites Features
-All fetcher-type classes are imported in the same way
+You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
 ```python
 from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
 ```
 And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
 
+If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
+```python
+from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+```
+then use it right away without initializing like:
+```python
+page = StealthyFetcher.fetch('https://example.com')
+```
+
 Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
 > [!NOTE]
 > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
@@ -803,6 +812,8 @@ Yes, Scrapling instances are thread-safe. Each Adaptor instance maintains its state
 
 ## More Sponsors!
 [](https://www.capsolver.com/?utm_source=github&utm_medium=repo&utm_campaign=scraping&utm_term=Scrapling)
+<a href="https://serpapi.com/?utm_source=scrapling"><img src="https://raw.githubusercontent.com/D4Vinci/Scrapling/main/images/SerpApi.png" height="500" width="500" alt="SerpApi Banner" ></a>
+
 
 ## Contributing
 Everybody is invited and welcome to contribute to Scrapling. There is a lot to do!
{scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/RECORD
CHANGED
@@ -1,6 +1,7 @@
-scrapling/__init__.py,sha256=
+scrapling/__init__.py,sha256=lpRuPRo5y_KrUeY78qgX5H_C2dWFV33VqrTX0OafHO8,435
+scrapling/defaults.py,sha256=blYDLiuI5DgDSLRWnUgpp21WtFOsv1BsCRCmPeg8Xc4,287
 scrapling/fetchers.py,sha256=_6mL7XSTZE1fHXBqbxE2bBHnlQP1lH-4MCiQHQd5hQs,16017
-scrapling/parser.py,sha256=
+scrapling/parser.py,sha256=d2n00uF5i7W5lf0afLNRdk17ZFcNyiF9EzXLRQGA0NM,54111
 scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
 scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 scrapling/core/_types.py,sha256=nD2ZY_fitLohx3MfDmqoKJ9ZShrnRhQ8-d1SU1zEGAY,552
@@ -10,12 +11,12 @@ scrapling/core/storage_adaptors.py,sha256=Kbak0BOJX5e9I1PbUS_4sUJi2Wxw8Bv5XsaLHA
 scrapling/core/translator.py,sha256=oU-dQCkNQOccZPrXbPW_VSgC5ll10Bb89C3ezW2lI0o,5228
 scrapling/core/utils.py,sha256=fXdANUgRBbVbOerJ94fRY9vi7n5zsbm8t3G4qQ-F3ak,3792
 scrapling/engines/__init__.py,sha256=zwMqcSdNGh-IX0d4zXazrgAeHrkqIN_v5Ia7RU1g8W0,267
-scrapling/engines/camo.py,sha256=
+scrapling/engines/camo.py,sha256=41vp2Nh51kKuOSZ1PijsIpROpQZgFfUPybVbEX8pEXk,7530
 scrapling/engines/constants.py,sha256=jSDA6lgbvEIB8z2m2SFzCKkvFEZnp28Mondy2__FpkM,3721
-scrapling/engines/pw.py,sha256=
-scrapling/engines/static.py,sha256=
+scrapling/engines/pw.py,sha256=l5MrSW_WNBKAxAlyxbt09ka_lEGo61XKuaOgWpYmvHk,12102
+scrapling/engines/static.py,sha256=Wsp6_-soZUQJT6kHoKPkLOdHU9J50chLdYxDmQjO4FQ,7101
 scrapling/engines/toolbelt/__init__.py,sha256=BnBp34aDeohYgqdysEAAWnGZgA02YlExkc5FJLetMSo,367
-scrapling/engines/toolbelt/custom.py,sha256=
+scrapling/engines/toolbelt/custom.py,sha256=8lvGHWIZoOotSTF97KgPb3CbJquel2QFx8rP8Hf2sQ4,7469
 scrapling/engines/toolbelt/fingerprints.py,sha256=kkVtZKSt2ukc0CV0g6QUvSWR0Yx5p8Mv8xiqACAsMBo,2917
 scrapling/engines/toolbelt/navigation.py,sha256=Tde5_6Wv7lOeWXMzs9D6TRaxAbJ3b-zIX6-4HggZbCQ,4017
 tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
@@ -26,8 +27,8 @@ tests/fetchers/test_playwright.py,sha256=YOWn89urd9NwoCHfTFj8fY4xYrRY2BeszTt5Q-T
 tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 tests/parser/test_automatch.py,sha256=BeeYJi3cYCghbiZmi57z4bqcGPaoUA8GAm7MALBBkkk,2486
 tests/parser/test_general.py,sha256=NfTuGLgAm-LH0dVV0pvbRcYSNI-wSu05rdnuRzmB0m4,11664
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
-scrapling-0.2.
+scrapling-0.2.2.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+scrapling-0.2.2.dist-info/METADATA,sha256=gk7fij0BkRwA51dJlCbARlx_FW9_U9v9ptk3Mc5-YKQ,64784
+scrapling-0.2.2.dist-info/WHEEL,sha256=R06PA3UVYHThwHvxuRWMqaGcr-PuniXahwjmQRFMEkY,91
+scrapling-0.2.2.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+scrapling-0.2.2.dist-info/RECORD,,
{scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/LICENSE
File without changes
{scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/WHEEL
File without changes
{scrapling-0.2.1.dist-info → scrapling-0.2.2.dist-info}/top_level.txt
File without changes