scrapling 0.2.8__py3-none-any.whl → 0.2.91__py3-none-any.whl

Files changed (37)
  1. scrapling/__init__.py +4 -4
  2. scrapling/core/_types.py +2 -0
  3. scrapling/core/custom_types.py +88 -6
  4. scrapling/core/storage_adaptors.py +5 -6
  5. scrapling/core/translator.py +2 -2
  6. scrapling/core/utils.py +29 -27
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/camo.py +124 -24
  9. scrapling/engines/constants.py +4 -4
  10. scrapling/engines/pw.py +195 -91
  11. scrapling/engines/static.py +91 -48
  12. scrapling/engines/toolbelt/__init__.py +3 -3
  13. scrapling/engines/toolbelt/custom.py +16 -22
  14. scrapling/engines/toolbelt/fingerprints.py +3 -3
  15. scrapling/engines/toolbelt/navigation.py +21 -8
  16. scrapling/fetchers.py +231 -16
  17. scrapling/parser.py +50 -22
  18. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA +33 -18
  19. scrapling-0.2.91.dist-info/RECORD +47 -0
  20. tests/fetchers/async/__init__.py +0 -0
  21. tests/fetchers/async/test_camoufox.py +95 -0
  22. tests/fetchers/async/test_httpx.py +83 -0
  23. tests/fetchers/async/test_playwright.py +99 -0
  24. tests/fetchers/sync/__init__.py +0 -0
  25. tests/fetchers/sync/test_camoufox.py +68 -0
  26. tests/fetchers/sync/test_httpx.py +82 -0
  27. tests/fetchers/sync/test_playwright.py +87 -0
  28. tests/fetchers/test_utils.py +90 -122
  29. tests/parser/test_automatch.py +64 -9
  30. tests/parser/test_general.py +260 -218
  31. scrapling-0.2.8.dist-info/RECORD +0 -42
  32. tests/fetchers/test_camoufox.py +0 -65
  33. tests/fetchers/test_httpx.py +0 -68
  34. tests/fetchers/test_playwright.py +0 -77
  35. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/LICENSE +0 -0
  36. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/WHEEL +0 -0
  37. {scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/top_level.txt +0 -0
scrapling/parser.py CHANGED
@@ -2,6 +2,7 @@ import inspect
  import os
  import re
  from difflib import SequenceMatcher
+ from urllib.parse import urljoin

  from cssselect import SelectorError, SelectorSyntaxError
  from cssselect import parse as split_selectors
@@ -17,13 +18,14 @@ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
  StorageSystemMixin, _StorageTools)
  from scrapling.core.translator import HTMLTranslator
  from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
- is_jsonable, logging, setup_basic_logging)
+ is_jsonable, log)


  class Adaptor(SelectorsGeneration):
  __slots__ = (
- 'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
+ 'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
  '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+ '__keep_cdata', '__raw_body'
  )

  def __init__(
@@ -35,10 +37,10 @@ class Adaptor(SelectorsGeneration):
  huge_tree: bool = True,
  root: Optional[html.HtmlElement] = None,
  keep_comments: Optional[bool] = False,
+ keep_cdata: Optional[bool] = False,
  auto_match: Optional[bool] = True,
  storage: Any = SQLiteStorageSystem,
  storage_args: Optional[Dict] = None,
- debug: Optional[bool] = True,
  **kwargs
  ):
  """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
@@ -58,33 +60,36 @@ class Adaptor(SelectorsGeneration):
  :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
  Don't use it unless you know what you are doing!
  :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+ :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
  :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
  priority over all auto-match related arguments/functions in the class.
  :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
  :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
  If empty, default values will be used.
- :param debug: Enable debug mode
  """
  if root is None and not body and text is None:
  raise ValueError("Adaptor class needs text, body, or root arguments to work")

  self.__text = None
+ self.__raw_body = ''
  if root is None:
  if text is None:
  if not body or not isinstance(body, bytes):
  raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

  body = body.replace(b"\x00", b"").strip()
+ self.__raw_body = body.replace(b"\x00", b"").strip().decode()
  else:
  if not isinstance(text, str):
  raise TypeError(f"text argument must be of type str, got {text.__class__}")

  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+ self.__raw_body = text.strip()

  # https://lxml.de/api/lxml.etree.HTMLParser-class.html
  parser = html.HTMLParser(
- recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
- compact=True, huge_tree=huge_tree, default_doctype=True
+ recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+ compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
  )
  self._root = etree.fromstring(body, parser=parser, base_url=url)
  if is_jsonable(text or body.decode()):
@@ -99,7 +104,6 @@ class Adaptor(SelectorsGeneration):

  self._root = root

- setup_basic_logging(level='debug' if debug else 'info')
  self.__auto_match_enabled = auto_match

  if self.__auto_match_enabled:
@@ -110,7 +114,7 @@ class Adaptor(SelectorsGeneration):
  }

  if not hasattr(storage, '__wrapped__'):
- raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")
+ raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

  if not issubclass(storage.__wrapped__, StorageSystemMixin):
  raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -118,13 +122,13 @@ class Adaptor(SelectorsGeneration):
  self._storage = storage(**storage_args)

  self.__keep_comments = keep_comments
+ self.__keep_cdata = keep_cdata
  self.__huge_tree_enabled = huge_tree
  self.encoding = encoding
  self.url = url
  # For selector stuff
  self.__attributes = None
  self.__tag = None
- self.__debug = debug
  # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
  self.__response_data = {
  key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -151,12 +155,12 @@ class Adaptor(SelectorsGeneration):
  else:
  if issubclass(type(element), html.HtmlMixin):

- return self.__class__(
+ return Adaptor(
  root=element,
  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
- keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
- huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+ keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+ huge_tree=self.__huge_tree_enabled,
  **self.__response_data
  )
  return element
@@ -243,6 +247,10 @@ class Adaptor(SelectorsGeneration):

  return TextHandler(separator.join([s for s in _all_strings]))

+ def urljoin(self, relative_url: str) -> str:
+ """Join this Adaptor's url with a relative url to form an absolute full URL."""
+ return urljoin(self.url, relative_url)
+
  @property
  def attrib(self) -> AttributesHandler:
  """Get attributes of the element"""
@@ -255,7 +263,10 @@ class Adaptor(SelectorsGeneration):
  """Return the inner html code of the element"""
  return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

- body = html_content
+ @property
+ def body(self) -> str:
+ """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
+ return self.__raw_body or self.html_content

  def prettify(self) -> str:
  """Return a prettified version of the element's inner html-code"""
@@ -330,6 +341,16 @@ class Adaptor(SelectorsGeneration):

  return self.__convert_results(prev_element)

+ # For easy copy-paste from Scrapy/parsel code when needed :)
+ def get(self, default=None):
+ return self
+
+ def get_all(self):
+ return self
+
+ extract = get_all
+ extract_first = get
+
  def __str__(self) -> str:
  return self.html_content

@@ -392,10 +413,10 @@ class Adaptor(SelectorsGeneration):
  if score_table:
  highest_probability = max(score_table.keys())
  if score_table[highest_probability] and highest_probability >= percentage:
- logging.debug(f'Highest probability was {highest_probability}%')
- logging.debug('Top 5 best matching elements are: ')
+ log.debug(f'Highest probability was {highest_probability}%')
+ log.debug('Top 5 best matching elements are: ')
  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
- logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
+ log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
  if not adaptor_type:
  return score_table[highest_probability]
  return self.__convert_results(score_table[highest_probability])
@@ -521,7 +542,7 @@ class Adaptor(SelectorsGeneration):

  if selected_elements:
  if not self.__auto_match_enabled and auto_save:
- logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+ log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

  elif self.__auto_match_enabled and auto_save:
  self.save(selected_elements[0], identifier or selector)
@@ -540,7 +561,7 @@ class Adaptor(SelectorsGeneration):
  return self.__convert_results(selected_elements)

  elif not self.__auto_match_enabled and auto_match:
- logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+ log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

  return self.__convert_results(selected_elements)

@@ -744,8 +765,8 @@ class Adaptor(SelectorsGeneration):

  self._storage.save(element, identifier)
  else:
- logging.critical(
- "Can't use Auto-match features with disabled globally, you have to start a new class instance."
+ log.critical(
+ "Can't use Auto-match features while disabled globally, you have to start a new class instance."
  )

  def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -758,8 +779,8 @@ class Adaptor(SelectorsGeneration):
  if self.__auto_match_enabled:
  return self._storage.retrieve(identifier)

- logging.critical(
- "Can't use Auto-match features with disabled globally, you have to start a new class instance."
+ log.critical(
+ "Can't use Auto-match features while disabled globally, you have to start a new class instance."
  )

  # Operations on text functions
@@ -1073,12 +1094,19 @@ class Adaptors(List[Adaptor]):
  ]
  return self.__class__(results) if results else results

+ # For easy copy-paste from Scrapy/parsel code when needed :)
  def get(self, default=None):
  """Returns the first item of the current list
  :param default: the default value to return if the current list is empty
  """
  return self[0] if len(self) > 0 else default

+ def extract(self):
+ return self
+
+ extract_first = get
+ get_all = extract
+
  @property
  def first(self):
  """Returns the first item of the current list or `None` if the list is empty"""
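Taken together, the parser changes above remove the old `debug` flag, add a `keep_cdata` parsing option, expose the element's raw HTML through a `body` property, add `urljoin`, and add parsel-style aliases (`get`, `get_all`, `extract`, `extract_first`); sub-elements now also inherit `keep_comments`/`keep_cdata` from their parent instead of hard-coding `keep_comments=True`. A minimal sketch of how the new surface reads, assuming the usual top-level `Adaptor` import (the sample HTML and URL are illustrative, not taken from the diff):

```python
from scrapling import Adaptor

# Illustrative page only; the calls below mirror the additions in the diff above.
html_body = b'<html><body><a href="catalogue/tipping-the-velvet_999/index.html">Tipping the Velvet</a></body></html>'

page = Adaptor(
    body=html_body,
    url='https://books.toscrape.com/',
    keep_comments=False,  # comments are still dropped by default
    keep_cdata=False,     # new option: CDATA sections are stripped unless you opt in
    auto_match=False,     # note that the old `debug` argument no longer exists
)

link = page.css_first('a')
print(page.body)                          # raw, unprocessed HTML when available, else falls back to `html_content`
print(page.urljoin(link.attrib['href']))  # -> 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
print(link.get() is link)                 # the parsel-style aliases simply return the element itself
```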
{scrapling-0.2.8.dist-info → scrapling-0.2.91.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.8
+ Version: 0.2.91
  Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
@@ -21,7 +21,6 @@ Classifier: Topic :: Text Processing :: Markup :: HTML
  Classifier: Topic :: Software Development :: Libraries :: Python Modules
  Classifier: Programming Language :: Python :: 3
  Classifier: Programming Language :: Python :: 3 :: Only
- Classifier: Programming Language :: Python :: 3.8
  Classifier: Programming Language :: Python :: 3.9
  Classifier: Programming Language :: Python :: 3.10
  Classifier: Programming Language :: Python :: 3.11
@@ -29,7 +28,7 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Typing :: Typed
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.3
@@ -38,11 +37,10 @@ Requires-Dist: cssselect>=1.2
  Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
- Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright==1.48
- Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.4.4
- Requires-Dist: browserforge
+ Requires-Dist: httpx[brotli,socks,zstd]
+ Requires-Dist: playwright>=1.49.1
+ Requires-Dist: rebrowser-playwright>=1.49.1
+ Requires-Dist: camoufox[geoip]>=0.4.9

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
@@ -52,7 +50,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -81,7 +79,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Table of content
  * [Key Features](#key-features)
- * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
  * [Adaptive Scraping](#adaptive-scraping)
  * [Performance](#performance)
  * [Developing Experience](#developing-experience)
@@ -122,7 +120,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Key Features

- ### Fetch websites as you prefer
+ ### Fetch websites as you prefer with async support
  - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
  - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
  - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +211,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
+ Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
@@ -265,11 +263,11 @@ You might be slightly confused by now so let me clear things up. All fetcher-typ
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug`, which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.defaults import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
@@ -282,21 +280,32 @@ Also, the `Response` object returned from all fetchers is the same as the `Adapt
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

- For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

  You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
- >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
+ For Async requests, you will just replace the import like below:
+ ```python
+ >> from scrapling import AsyncFetcher
+ >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+ >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+ >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+ >> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+ ```
  ### StealthyFetcher
  This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
+ >> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+ >> page.status == 200
+ True
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -314,7 +323,8 @@ True
  | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
- | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | allow_webgl | Enabled by default. Disabling WebGL is not recommended as many WAFs now check whether WebGL is enabled. | ✔️ |
+ | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
  | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
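The two hunks above add async variants plus a `retries` argument to `Fetcher`, and new `geoip`/`allow_webgl` behaviour to `StealthyFetcher`. A hedged sketch of how those options might be combined; passing `geoip` as a boolean and a `proxy` argument here are assumptions read from the table and the `Fetcher` examples, not copied from the docs:

```python
import asyncio

from scrapling import Fetcher, AsyncFetcher, StealthyFetcher

# Plain HTTP request, overriding the documented default of 3 retries (the retry count is an arbitrary example).
page = Fetcher().get('https://httpbin.org/get', retries=5, stealthy_headers=True)
print(page.status)

# The async fetcher uses the same call shape; `asyncio.run` is just the standard way to drive the coroutine.
async def main():
    response = await AsyncFetcher().get('https://httpbin.org/get', retries=5, follow_redirects=True)
    return response.status

print(asyncio.run(main()))

# Stealthy browser fetch: geoip pairs the fingerprint with the exit IP, and WebGL stays enabled
# (the new default) since many WAFs check for it.
page = StealthyFetcher().fetch(
    'https://www.browserscan.net/bot-detection',
    geoip=True,        # assumed boolean form, per the `geoip` row above
    allow_webgl=True,  # enabled by default; shown only for clarity
    humanize=True,     # humanized cursor movement, as described in the table
    proxy='http://username:password@localhost:8030',  # assumed to be accepted here as in the Fetcher examples
)
print(page.status)
```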
@@ -333,6 +343,9 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
+ >> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+ >> page.css_first("#search a::attr(href)")
+ 'https://github.com/D4Vinci/Scrapling'
  ```
  > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

@@ -437,6 +450,9 @@ You can select elements by their text content in multiple ways, here's a full ex
  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+ >>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+ 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +866,6 @@ This project includes code adapted from:

  ## Known Issues
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
- - Currently, Scrapling is not compatible with async/await.

  ---
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>
scrapling-0.2.91.dist-info/RECORD ADDED
@@ -0,0 +1,47 @@
+ scrapling/__init__.py,sha256=pfbhEm1kcriA9pFR3JUUFEE3v4_ykB35SYbeHKzFxHw,500
+ scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
+ scrapling/fetchers.py,sha256=K3MKBqKDOXItJNwxFY2fe1C21Vz6QSd91fFtN98Mpg4,35402
+ scrapling/parser.py,sha256=Fl9cdbR58GuoPbWN5hZI6ToPSl0_rQFXMskTdzpoxWs,55208
+ scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+ scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scrapling/core/_types.py,sha256=OcsP1WeQEOlEVo9OzTrLQfgZZfXuJ0civVs31SynwGA,641
+ scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
+ scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
+ scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+ scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+ scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+ scrapling/engines/camo.py,sha256=g12IVIPy4Uyp_jngtu8Qcvy7PSMHjURAHUGXdM58Kks,13778
+ scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
+ scrapling/engines/pw.py,sha256=Eq4_oQA5eX666chiNpXsBqhWONzleniyXjKdmCpXj_Y,18630
+ scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
+ scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
+ scrapling/engines/toolbelt/custom.py,sha256=d3qyeCg_qHm1RRE7yv5hyU9b17Y7YDPGBOVhEH1CAT0,12754
+ scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
+ scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+ scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+ scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+ scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+ scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+ scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+ scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+ tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+ tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+ tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
+ tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
+ tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
+ tests/fetchers/async/test_playwright.py,sha256=zzSYnfRksjNep_YipTiYAB9eQaIo3fssKLrsGzXEakw,4068
+ tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
+ tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
+ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxnmprW0WO6Q,3780
+ tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
+ tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
+ scrapling-0.2.91.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.91.dist-info/METADATA,sha256=ajc8n5Hjl--ZdGXwHxmfMEWyCMgbw1waZNovoPFxrUc,68339
+ scrapling-0.2.91.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.91.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.91.dist-info/RECORD,,
tests/fetchers/async/test_camoufox.py ADDED
@@ -0,0 +1,95 @@
+ import pytest
+ import pytest_httpbin
+
+ from scrapling import StealthyFetcher
+
+
+ @pytest_httpbin.use_class_based_httpbin
+ @pytest.mark.asyncio
+ class TestStealthyFetcher:
+ @pytest.fixture(scope="class")
+ def fetcher(self):
+ return StealthyFetcher(auto_match=False)
+
+ @pytest.fixture(scope="class")
+ def urls(self, httpbin):
+ url = httpbin.url
+ return {
+ 'status_200': f'{url}/status/200',
+ 'status_404': f'{url}/status/404',
+ 'status_501': f'{url}/status/501',
+ 'basic_url': f'{url}/get',
+ 'html_url': f'{url}/html',
+ 'delayed_url': f'{url}/delay/10', # 10 Seconds delay response
+ 'cookies_url': f"{url}/cookies/set/test/value"
+ }
+
+ async def test_basic_fetch(self, fetcher, urls):
+ """Test doing basic fetch request with multiple statuses"""
+ assert (await fetcher.async_fetch(urls['status_200'])).status == 200
+ assert (await fetcher.async_fetch(urls['status_404'])).status == 404
+ assert (await fetcher.async_fetch(urls['status_501'])).status == 501
+
+ async def test_networkidle(self, fetcher, urls):
+ """Test if waiting for `networkidle` make page does not finish loading or not"""
+ assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
+
+ async def test_blocking_resources(self, fetcher, urls):
+ """Test if blocking resources make page does not finish loading or not"""
+ assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
+ assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
+
+ async def test_waiting_selector(self, fetcher, urls):
+ """Test if waiting for a selector make page does not finish loading or not"""
+ assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
+ assert (await fetcher.async_fetch(
+ urls['html_url'],
+ wait_selector='h1',
+ wait_selector_state='visible'
+ )).status == 200
+
+ async def test_cookies_loading(self, fetcher, urls):
+ """Test if cookies are set after the request"""
+ response = await fetcher.async_fetch(urls['cookies_url'])
+ assert response.cookies == {'test': 'value'}
+
+ async def test_automation(self, fetcher, urls):
+ """Test if automation break the code or not"""
+
+ async def scroll_page(page):
+ await page.mouse.wheel(10, 0)
+ await page.mouse.move(100, 400)
+ await page.mouse.up()
+ return page
+
+ assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
+
+ async def test_properties(self, fetcher, urls):
+ """Test if different arguments breaks the code or not"""
+ assert (await fetcher.async_fetch(
+ urls['html_url'],
+ block_webrtc=True,
+ allow_webgl=True
+ )).status == 200
+
+ assert (await fetcher.async_fetch(
+ urls['html_url'],
+ block_webrtc=False,
+ allow_webgl=True
+ )).status == 200
+
+ assert (await fetcher.async_fetch(
+ urls['html_url'],
+ block_webrtc=True,
+ allow_webgl=False
+ )).status == 200
+
+ assert (await fetcher.async_fetch(
+ urls['html_url'],
+ extra_headers={'ayo': ''},
+ os_randomize=True
+ )).status == 200
+
+ async def test_infinite_timeout(self, fetcher, urls):
+ """Test if infinite timeout breaks the code or not"""
+ assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200
tests/fetchers/async/test_httpx.py ADDED
@@ -0,0 +1,83 @@
+ import pytest
+ import pytest_httpbin
+
+ from scrapling.fetchers import AsyncFetcher
+
+
+ @pytest_httpbin.use_class_based_httpbin
+ @pytest.mark.asyncio
+ class TestAsyncFetcher:
+ @pytest.fixture(scope="class")
+ def fetcher(self):
+ return AsyncFetcher(auto_match=True)
+
+ @pytest.fixture(scope="class")
+ def urls(self, httpbin):
+ return {
+ 'status_200': f'{httpbin.url}/status/200',
+ 'status_404': f'{httpbin.url}/status/404',
+ 'status_501': f'{httpbin.url}/status/501',
+ 'basic_url': f'{httpbin.url}/get',
+ 'post_url': f'{httpbin.url}/post',
+ 'put_url': f'{httpbin.url}/put',
+ 'delete_url': f'{httpbin.url}/delete',
+ 'html_url': f'{httpbin.url}/html'
+ }
+
+ async def test_basic_get(self, fetcher, urls):
+ """Test doing basic get request with multiple statuses"""
+ assert (await fetcher.get(urls['status_200'])).status == 200
+ assert (await fetcher.get(urls['status_404'])).status == 404
+ assert (await fetcher.get(urls['status_501'])).status == 501
+
+ async def test_get_properties(self, fetcher, urls):
+ """Test if different arguments with GET request breaks the code or not"""
+ assert (await fetcher.get(urls['status_200'], stealthy_headers=True)).status == 200
+ assert (await fetcher.get(urls['status_200'], follow_redirects=True)).status == 200
+ assert (await fetcher.get(urls['status_200'], timeout=None)).status == 200
+ assert (await fetcher.get(
+ urls['status_200'],
+ stealthy_headers=True,
+ follow_redirects=True,
+ timeout=None
+ )).status == 200
+
+ async def test_post_properties(self, fetcher, urls):
+ """Test if different arguments with POST request breaks the code or not"""
+ assert (await fetcher.post(urls['post_url'], data={'key': 'value'})).status == 200
+ assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, stealthy_headers=True)).status == 200
+ assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, follow_redirects=True)).status == 200
+ assert (await fetcher.post(urls['post_url'], data={'key': 'value'}, timeout=None)).status == 200
+ assert (await fetcher.post(
+ urls['post_url'],
+ data={'key': 'value'},
+ stealthy_headers=True,
+ follow_redirects=True,
+ timeout=None
+ )).status == 200
+
+ async def test_put_properties(self, fetcher, urls):
+ """Test if different arguments with PUT request breaks the code or not"""
+ assert (await fetcher.put(urls['put_url'], data={'key': 'value'})).status in [200, 405]
+ assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, stealthy_headers=True)).status in [200, 405]
+ assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, follow_redirects=True)).status in [200, 405]
+ assert (await fetcher.put(urls['put_url'], data={'key': 'value'}, timeout=None)).status in [200, 405]
+ assert (await fetcher.put(
+ urls['put_url'],
+ data={'key': 'value'},
+ stealthy_headers=True,
+ follow_redirects=True,
+ timeout=None
+ )).status in [200, 405]
+
+ async def test_delete_properties(self, fetcher, urls):
+ """Test if different arguments with DELETE request breaks the code or not"""
+ assert (await fetcher.delete(urls['delete_url'], stealthy_headers=True)).status == 200
+ assert (await fetcher.delete(urls['delete_url'], follow_redirects=True)).status == 200
+ assert (await fetcher.delete(urls['delete_url'], timeout=None)).status == 200
+ assert (await fetcher.delete(
+ urls['delete_url'],
+ stealthy_headers=True,
+ follow_redirects=True,
+ timeout=None
+ )).status == 200