scrapling 0.2.7__py3-none-any.whl → 0.2.9__py3-none-any.whl

Files changed (38)
  1. scrapling/__init__.py +5 -4
  2. scrapling/core/_types.py +2 -3
  3. scrapling/core/custom_types.py +93 -11
  4. scrapling/core/storage_adaptors.py +9 -10
  5. scrapling/core/translator.py +6 -7
  6. scrapling/core/utils.py +35 -30
  7. scrapling/defaults.py +2 -1
  8. scrapling/engines/__init__.py +2 -2
  9. scrapling/engines/camo.py +96 -26
  10. scrapling/engines/constants.py +4 -4
  11. scrapling/engines/pw.py +166 -96
  12. scrapling/engines/static.py +94 -50
  13. scrapling/engines/toolbelt/__init__.py +6 -20
  14. scrapling/engines/toolbelt/custom.py +22 -23
  15. scrapling/engines/toolbelt/fingerprints.py +7 -7
  16. scrapling/engines/toolbelt/navigation.py +25 -12
  17. scrapling/fetchers.py +233 -17
  18. scrapling/parser.py +63 -28
  19. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA +41 -25
  20. scrapling-0.2.9.dist-info/RECORD +47 -0
  21. tests/fetchers/async/__init__.py +0 -0
  22. tests/fetchers/async/test_camoufox.py +95 -0
  23. tests/fetchers/async/test_httpx.py +83 -0
  24. tests/fetchers/async/test_playwright.py +99 -0
  25. tests/fetchers/sync/__init__.py +0 -0
  26. tests/fetchers/sync/test_camoufox.py +68 -0
  27. tests/fetchers/sync/test_httpx.py +82 -0
  28. tests/fetchers/sync/test_playwright.py +87 -0
  29. tests/fetchers/test_utils.py +90 -122
  30. tests/parser/test_automatch.py +64 -9
  31. tests/parser/test_general.py +263 -219
  32. scrapling-0.2.7.dist-info/RECORD +0 -42
  33. tests/fetchers/test_camoufox.py +0 -64
  34. tests/fetchers/test_httpx.py +0 -67
  35. tests/fetchers/test_playwright.py +0 -76
  36. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/LICENSE +0 -0
  37. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/WHEEL +0 -0
  38. {scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/top_level.txt +0 -0
scrapling/parser.py CHANGED
@@ -1,22 +1,31 @@
+ import inspect
  import os
  import re
- import inspect
  from difflib import SequenceMatcher
+ from urllib.parse import urljoin

- from scrapling.core.translator import HTMLTranslator
- from scrapling.core.mixins import SelectorsGeneration
- from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
- from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
- from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
- from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
+ from cssselect import SelectorError, SelectorSyntaxError
+ from cssselect import parse as split_selectors
  from lxml import etree, html
- from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
+
+ from scrapling.core._types import (Any, Callable, Dict, Generator, Iterable,
+                                    List, Optional, Pattern, SupportsIndex,
+                                    Tuple, Union)
+ from scrapling.core.custom_types import (AttributesHandler, TextHandler,
+                                          TextHandlers)
+ from scrapling.core.mixins import SelectorsGeneration
+ from scrapling.core.storage_adaptors import (SQLiteStorageSystem,
+                                              StorageSystemMixin, _StorageTools)
+ from scrapling.core.translator import HTMLTranslator
+ from scrapling.core.utils import (clean_spaces, flatten, html_forbidden,
+                                   is_jsonable, log)


  class Adaptor(SelectorsGeneration):
      __slots__ = (
-         'url', 'encoding', '__auto_match_enabled', '_root', '_storage', '__debug',
+         'url', 'encoding', '__auto_match_enabled', '_root', '_storage',
          '__keep_comments', '__huge_tree_enabled', '__attributes', '__text', '__tag',
+         '__keep_cdata', '__raw_body'
      )

      def __init__(
@@ -28,10 +37,10 @@ class Adaptor(SelectorsGeneration):
              huge_tree: bool = True,
              root: Optional[html.HtmlElement] = None,
              keep_comments: Optional[bool] = False,
+             keep_cdata: Optional[bool] = False,
              auto_match: Optional[bool] = True,
              storage: Any = SQLiteStorageSystem,
              storage_args: Optional[Dict] = None,
-             debug: Optional[bool] = True,
              **kwargs
      ):
          """The main class that works as a wrapper for the HTML input data. Using this class, you can search for elements
@@ -51,33 +60,36 @@ class Adaptor(SelectorsGeneration):
          :param root: Used internally to pass etree objects instead of text/body arguments, it takes highest priority.
              Don't use it unless you know what you are doing!
          :param keep_comments: While parsing the HTML body, drop comments or not. Disabled by default for obvious reasons
+         :param keep_cdata: While parsing the HTML body, drop cdata or not. Disabled by default for cleaner HTML.
          :param auto_match: Globally turn-off the auto-match feature in all functions, this argument takes higher
              priority over all auto-match related arguments/functions in the class.
          :param storage: The storage class to be passed for auto-matching functionalities, see ``Docs`` for more info.
          :param storage_args: A dictionary of ``argument->value`` pairs to be passed for the storage class.
              If empty, default values will be used.
-         :param debug: Enable debug mode
          """
          if root is None and not body and text is None:
              raise ValueError("Adaptor class needs text, body, or root arguments to work")

          self.__text = None
+         self.__raw_body = ''
          if root is None:
              if text is None:
                  if not body or not isinstance(body, bytes):
                      raise TypeError(f"body argument must be valid and of type bytes, got {body.__class__}")

                  body = body.replace(b"\x00", b"").strip()
+                 self.__raw_body = body.replace(b"\x00", b"").strip().decode()
              else:
                  if not isinstance(text, str):
                      raise TypeError(f"text argument must be of type str, got {text.__class__}")

                  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
+                 self.__raw_body = text.strip()

              # https://lxml.de/api/lxml.etree.HTMLParser-class.html
              parser = html.HTMLParser(
-                 recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
-                 compact=True, huge_tree=huge_tree, default_doctype=True
+                 recover=True, remove_blank_text=True, remove_comments=(not keep_comments), encoding=encoding,
+                 compact=True, huge_tree=huge_tree, default_doctype=True, strip_cdata=(not keep_cdata),
              )
              self._root = etree.fromstring(body, parser=parser, base_url=url)
              if is_jsonable(text or body.decode()):
@@ -92,7 +104,6 @@ class Adaptor(SelectorsGeneration):

              self._root = root

-         setup_basic_logging(level='debug' if debug else 'info')
          self.__auto_match_enabled = auto_match

          if self.__auto_match_enabled:
@@ -103,7 +114,7 @@ class Adaptor(SelectorsGeneration):
              }

              if not hasattr(storage, '__wrapped__'):
-                 raise ValueError("Storage class must be wrapped with cache decorator, see docs for info")
+                 raise ValueError("Storage class must be wrapped with lru_cache decorator, see docs for info")

              if not issubclass(storage.__wrapped__, StorageSystemMixin):
                  raise ValueError("Storage system must be inherited from class `StorageSystemMixin`")
@@ -111,13 +122,13 @@ class Adaptor(SelectorsGeneration):
              self._storage = storage(**storage_args)

          self.__keep_comments = keep_comments
+         self.__keep_cdata = keep_cdata
          self.__huge_tree_enabled = huge_tree
          self.encoding = encoding
          self.url = url
          # For selector stuff
          self.__attributes = None
          self.__tag = None
-         self.__debug = debug
          # No need to check if all response attributes exist or not because if `status` exist, then the rest exist (Save some CPU cycles for speed)
          self.__response_data = {
              key: getattr(self, key) for key in ('status', 'reason', 'cookies', 'headers', 'request_headers',)
@@ -148,8 +159,8 @@ class Adaptor(SelectorsGeneration):
                  root=element,
                  text='', body=b'', # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
                  url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                 keep_comments=True, # if the comments are already removed in initialization, no need to try to delete them in sub-elements
-                 huge_tree=self.__huge_tree_enabled, debug=self.__debug,
+                 keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+                 huge_tree=self.__huge_tree_enabled,
                  **self.__response_data
              )
              return element
@@ -236,6 +247,10 @@ class Adaptor(SelectorsGeneration):

          return TextHandler(separator.join([s for s in _all_strings]))

+     def urljoin(self, relative_url: str) -> str:
+         """Join this Adaptor's url with a relative url to form an absolute full URL."""
+         return urljoin(self.url, relative_url)
+
      @property
      def attrib(self) -> AttributesHandler:
          """Get attributes of the element"""
@@ -248,7 +263,10 @@ class Adaptor(SelectorsGeneration):
          """Return the inner html code of the element"""
          return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)

-     body = html_content
+     @property
+     def body(self) -> str:
+         """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
+         return self.__raw_body or self.html_content

      def prettify(self) -> str:
          """Return a prettified version of the element's inner html-code"""
@@ -323,6 +341,16 @@ class Adaptor(SelectorsGeneration):

          return self.__convert_results(prev_element)

+     # For easy copy-paste from Scrapy/parsel code when needed :)
+     def get(self, default=None):
+         return self
+
+     def get_all(self):
+         return self
+
+     extract = get_all
+     extract_first = get
+
      def __str__(self) -> str:
          return self.html_content

@@ -385,10 +413,10 @@ class Adaptor(SelectorsGeneration):
          if score_table:
              highest_probability = max(score_table.keys())
              if score_table[highest_probability] and highest_probability >= percentage:
-                 logging.debug(f'Highest probability was {highest_probability}%')
-                 logging.debug('Top 5 best matching elements are: ')
+                 log.debug(f'Highest probability was {highest_probability}%')
+                 log.debug('Top 5 best matching elements are: ')
                  for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                     logging.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
+                     log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
                  if not adaptor_type:
                      return score_table[highest_probability]
                  return self.__convert_results(score_table[highest_probability])
@@ -514,7 +542,7 @@ class Adaptor(SelectorsGeneration):

          if selected_elements:
              if not self.__auto_match_enabled and auto_save:
-                 logging.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+                 log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

              elif self.__auto_match_enabled and auto_save:
                  self.save(selected_elements[0], identifier or selector)
@@ -533,7 +561,7 @@ class Adaptor(SelectorsGeneration):
              return self.__convert_results(selected_elements)

          elif not self.__auto_match_enabled and auto_match:
-             logging.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+             log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")

          return self.__convert_results(selected_elements)

@@ -737,8 +765,8 @@ class Adaptor(SelectorsGeneration):

              self._storage.save(element, identifier)
          else:
-             logging.critical(
-                 "Can't use Auto-match features with disabled globally, you have to start a new class instance."
+             log.critical(
+                 "Can't use Auto-match features while disabled globally, you have to start a new class instance."
              )

      def retrieve(self, identifier: str) -> Optional[Dict]:
@@ -751,8 +779,8 @@ class Adaptor(SelectorsGeneration):
          if self.__auto_match_enabled:
              return self._storage.retrieve(identifier)

-         logging.critical(
-             "Can't use Auto-match features with disabled globally, you have to start a new class instance."
+         log.critical(
+             "Can't use Auto-match features while disabled globally, you have to start a new class instance."
          )

      # Operations on text functions
@@ -1066,12 +1094,19 @@ class Adaptors(List[Adaptor]):
          ]
          return self.__class__(results) if results else results

+     # For easy copy-paste from Scrapy/parsel code when needed :)
      def get(self, default=None):
          """Returns the first item of the current list
          :param default: the default value to return if the current list is empty
          """
          return self[0] if len(self) > 0 else default

+     def extract(self):
+         return self
+
+     extract_first = get
+     get_all = extract
+
      @property
      def first(self):
          """Returns the first item of the current list or `None` if the list is empty"""

{scrapling-0.2.7.dist-info → scrapling-0.2.9.dist-info}/METADATA CHANGED
@@ -1,7 +1,7 @@
  Metadata-Version: 2.1
  Name: scrapling
- Version: 0.2.7
- Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
+ Version: 0.2.9
+ Summary: Scrapling is a powerful, flexible, and high-performance web scraping library for Python. It
  Home-page: https://github.com/D4Vinci/Scrapling
  Author: Karim Shoair
  Author-email: karim.shoair@pm.me
@@ -29,7 +29,7 @@ Classifier: Programming Language :: Python :: 3.12
  Classifier: Programming Language :: Python :: 3.13
  Classifier: Programming Language :: Python :: Implementation :: CPython
  Classifier: Typing :: Typed
- Requires-Python: >=3.8
+ Requires-Python: >=3.9
  Description-Content-Type: text/markdown
  License-File: LICENSE
  Requires-Dist: requests>=2.3
@@ -39,10 +39,9 @@ Requires-Dist: w3lib
  Requires-Dist: orjson>=3
  Requires-Dist: tldextract
  Requires-Dist: httpx[brotli,zstd]
- Requires-Dist: playwright==1.48
- Requires-Dist: rebrowser-playwright
- Requires-Dist: camoufox>=0.3.10
- Requires-Dist: browserforge
+ Requires-Dist: playwright>=1.49.1
+ Requires-Dist: rebrowser-playwright>=1.49.1
+ Requires-Dist: camoufox[geoip]>=0.4.9

  # 🕷️ Scrapling: Undetectable, Lightning-Fast, and Adaptive Web Scraping for Python
  [![Tests](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml/badge.svg)](https://github.com/D4Vinci/Scrapling/actions/workflows/tests.yml) [![PyPI version](https://badge.fury.io/py/Scrapling.svg)](https://badge.fury.io/py/Scrapling) [![Supported Python versions](https://img.shields.io/pypi/pyversions/scrapling.svg)](https://pypi.org/project/scrapling/) [![PyPI Downloads](https://static.pepy.tech/badge/scrapling)](https://pepy.tech/project/scrapling)
@@ -52,7 +51,7 @@ Dealing with failing web scrapers due to anti-bot protections or website changes
  Scrapling is a high-performance, intelligent web scraping library for Python that automatically adapts to website changes while significantly outperforming popular alternatives. For both beginners and experts, Scrapling provides powerful features while maintaining simplicity.

  ```python
- >> from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ >> from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  # Fetch websites' source under the radar!
  >> page = StealthyFetcher.fetch('https://example.com', headless=True, network_idle=True)
  >> print(page.status)
@@ -81,7 +80,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Table of content
  * [Key Features](#key-features)
- * [Fetch websites as you prefer](#fetch-websites-as-you-prefer)
+ * [Fetch websites as you prefer](#fetch-websites-as-you-prefer-with-async-support)
  * [Adaptive Scraping](#adaptive-scraping)
  * [Performance](#performance)
  * [Developing Experience](#developing-experience)
@@ -122,7 +121,7 @@ Scrapling is a high-performance, intelligent web scraping library for Python tha

  ## Key Features

- ### Fetch websites as you prefer
+ ### Fetch websites as you prefer with async support
  - **HTTP requests**: Stealthy and fast HTTP requests with `Fetcher`
  - **Stealthy fetcher**: Annoying anti-bot protection? No problem! Scrapling can bypass almost all of them with `StealthyFetcher` with default configuration!
  - **Your preferred browser**: Use your real browser with CDP, [NSTbrowser](https://app.nstbrowser.io/r/1vO5e5)'s browserless, PlayWright with stealth mode, or even vanilla PlayWright - All is possible with `PlayWrightFetcher`!
@@ -213,7 +212,7 @@ Scrapling can find elements with more methods and it returns full element `Adapt
  > All benchmarks' results are an average of 100 runs. See our [benchmarks.py](https://github.com/D4Vinci/Scrapling/blob/main/benchmarks.py) for methodology and to run your comparisons.

  ## Installation
- Scrapling is a breeze to get started with - Starting from version 0.2, we require at least Python 3.8 to work.
+ Scrapling is a breeze to get started with - Starting from version 0.2.9, we require at least Python 3.9 to work.
  ```bash
  pip3 install scrapling
  ```
@@ -258,47 +257,58 @@ python -m browserforge update
  ```

  ## Fetching Websites
- Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you want then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.
+ Fetchers are basically interfaces that do requests or fetch pages for you in a single request fashion and then return an `Adaptor` object for you. This feature was introduced because the only option we had before was to fetch the page as you wanted it, then pass it manually to the `Adaptor` class to create an `Adaptor` instance and start playing around with the page.

  ### Features
- You might be a little bit confused by now so let me clear things up. All fetcher-type classes are imported in the same way
+ You might be slightly confused by now so let me clear things up. All fetcher-type classes are imported in the same way
  ```python
  from scrapling import Fetcher, StealthyFetcher, PlayWrightFetcher
  ```
- And all of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `storage`, `storage_args`, and `debug` which are the same ones you give to the `Adaptor` class.
+ All of them can take these initialization arguments: `auto_match`, `huge_tree`, `keep_comments`, `keep_cdata`, `storage`, and `storage_args`, which are the same ones you give to the `Adaptor` class.

  If you don't want to pass arguments to the generated `Adaptor` object and want to use the default values, you can use this import instead for cleaner code:
  ```python
- from scrapling.default import Fetcher, StealthyFetcher, PlayWrightFetcher
+ from scrapling.defaults import Fetcher, AsyncFetcher, StealthyFetcher, PlayWrightFetcher
  ```
  then use it right away without initializing like:
  ```python
  page = StealthyFetcher.fetch('https://example.com')
  ```

- Also, the `Response` object returned from all fetchers is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
+ Also, the `Response` object returned from all fetchers is the same as the `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`. All `cookies`, `headers`, and `request_headers` are always of type `dictionary`.
  > [!NOTE]
  > The `auto_match` argument is enabled by default which is the one you should care about the most as you will see later.
  ### Fetcher
  This class is built on top of [httpx](https://www.python-httpx.org/) with additional configuration options, here you can do `GET`, `POST`, `PUT`, and `DELETE` requests.

- For all methods, you have `stealth_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default.
+ For all methods, you have `stealthy_headers` which makes `Fetcher` create and use real browser's headers then create a referer header as if this request came from Google's search of this URL's domain. It's enabled by default. You can also set the number of retries with the argument `retries` for all methods and this will make httpx retry requests if it failed for any reason. The default number of retries for all `Fetcher` methods is 3.

  You can route all traffic (HTTP and HTTPS) to a proxy for any of these methods in this format `http://username:password@localhost:8030`
  ```python
- >> page = Fetcher().get('https://httpbin.org/get', stealth_headers=True, follow_redirects=True)
+ >> page = Fetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
  >> page = Fetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
  >> page = Fetcher().put('https://httpbin.org/put', data={'key': 'value'})
  >> page = Fetcher().delete('https://httpbin.org/delete')
  ```
+ For Async requests, you will just replace the import like below:
+ ```python
+ >> from scrapling import AsyncFetcher
+ >> page = await AsyncFetcher().get('https://httpbin.org/get', stealthy_headers=True, follow_redirects=True)
+ >> page = await AsyncFetcher().post('https://httpbin.org/post', data={'key': 'value'}, proxy='http://username:password@localhost:8030')
+ >> page = await AsyncFetcher().put('https://httpbin.org/put', data={'key': 'value'})
+ >> page = await AsyncFetcher().delete('https://httpbin.org/delete')
+ ```
  ### StealthyFetcher
- This class is built on top of [Camoufox](https://github.com/daijro/camoufox) which by default bypasses most of the anti-bot protections. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
+ This class is built on top of [Camoufox](https://github.com/daijro/camoufox), bypassing most anti-bot protections by default. Scrapling adds extra layers of flavors and configurations to increase performance and undetectability even further.
  ```python
  >> page = StealthyFetcher().fetch('https://www.browserscan.net/bot-detection') # Running headless by default
  >> page.status == 200
  True
+ >> page = await StealthyFetcher().async_fetch('https://www.browserscan.net/bot-detection') # the async version of fetch
+ >> page.status == 200
+ True
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  <details><summary><strong>For the sake of simplicity, expand this for the complete list of arguments</strong></summary>

@@ -314,7 +324,8 @@ True
  | page_action | Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again. | ✔️ |
  | addons | List of Firefox addons to use. **Must be paths to extracted addons.** | ✔️ |
  | humanize | Humanize the cursor movement. Takes either True or the MAX duration in seconds of the cursor movement. The cursor typically takes up to 1.5 seconds to move across the window. | ✔️ |
- | allow_webgl | Whether to allow WebGL. To prevent leaks, only use this for special cases. | ✔️ |
+ | allow_webgl | Enabled by default. Disabling WebGL is not recommended, as many WAFs now check whether WebGL is enabled. | ✔️ |
+ | geoip | Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, & spoof the WebRTC IP address. It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region. | ✔️ |
  | disable_ads | Enabled by default, this installs `uBlock Origin` addon on the browser if enabled. | ✔️ |
  | network_idle | Wait for the page until there are no network connections for at least 500 ms. | ✔️ |
  | timeout | The timeout in milliseconds that is used in all operations and waits through the page. The default is 30000. | ✔️ |
@@ -333,8 +344,11 @@ This class is built on top of [Playwright](https://playwright.dev/python/) which
  >> page = PlayWrightFetcher().fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # Vanilla Playwright option
  >> page.css_first("#search a::attr(href)")
  'https://github.com/D4Vinci/Scrapling'
+ >> page = await PlayWrightFetcher().async_fetch('https://www.google.com/search?q=%22Scrapling%22', disable_resources=True) # the async version of fetch
+ >> page.css_first("#search a::attr(href)")
+ 'https://github.com/D4Vinci/Scrapling'
  ```
- > Note: all requests done by this fetcher is waiting by default for all JS to be fully loaded and executed so you don't have to :)
+ > Note: all requests done by this fetcher are waiting by default for all JS to be fully loaded and executed so you don't have to :)

  Using this Fetcher class, you can make requests with:
  1) Vanilla Playwright without any modifications other than the ones you chose.
@@ -346,7 +360,7 @@ Using this Fetcher class, you can make requests with:
  3) Real browsers by passing the `real_chrome` argument or the CDP URL of your browser to be controlled by the Fetcher and most of the options can be enabled on it.
  4) [NSTBrowser](https://app.nstbrowser.io/r/1vO5e5)'s [docker browserless](https://hub.docker.com/r/nstbrowser/browserless) option by passing the CDP URL and enabling `nstbrowser_mode` option.

- > Hence using the `real_chrome` argument requires that you have chrome browser installed on your device
+ > Hence using the `real_chrome` argument requires that you have Chrome browser installed on your device

  Add that to a lot of controlling/hiding options as you will see in the arguments list below.

@@ -369,7 +383,7 @@ Add that to a lot of controlling/hiding options as you will see in the arguments
  | hide_canvas | Add random noise to canvas operations to prevent fingerprinting. | ✔️ |
  | disable_webgl | Disables WebGL and WebGL 2.0 support entirely. | ✔️ |
  | stealth | Enables stealth mode, always check the documentation to see what stealth mode does currently. | ✔️ |
- | real_chrome | If you have chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
+ | real_chrome | If you have Chrome browser installed on your device, enable this and the Fetcher will launch an instance of your browser and use it. | ✔️ |
  | locale | Set the locale for the browser if wanted. The default value is `en-US`. | ✔️ |
  | cdp_url | Instead of launching a new browser instance, connect to this CDP URL to control real browsers/NSTBrowser through CDP. | ✔️ |
  | nstbrowser_mode | Enables NSTBrowser mode, **it have to be used with `cdp_url` argument or it will get completely ignored.** | ✔️ |
@@ -437,6 +451,9 @@ You can select elements by their text content in multiple ways, here's a full ex
  >>> page.find_by_text('Tipping the Velvet') # Find the first element whose text fully matches this text
  <data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>

+ >>> page.urljoin(page.find_by_text('Tipping the Velvet').attrib['href']) # We use `page.urljoin` to return the full URL from the relative `href`
+ 'https://books.toscrape.com/catalogue/tipping-the-velvet_999/index.html'
+
  >>> page.find_by_text('Tipping the Velvet', first_match=False) # Get all matches if there are more
  [<data='<a href="catalogue/tipping-the-velvet_99...' parent='<h3><a href="catalogue/tipping-the-velve...'>]

@@ -850,7 +867,6 @@ This project includes code adapted from:

  ## Known Issues
  - In the auto-matching save process, the unique properties of the first element from the selection results are the only ones that get saved. So if the selector you are using selects different elements on the page that are in different locations, auto-matching will probably return to you the first element only when you relocate it later. This doesn't include combined CSS selectors (Using commas to combine more than one selector for example) as these selectors get separated and each selector gets executed alone.
- - Currently, Scrapling is not compatible with async/await.

  ---
  <div align="center"><small>Designed & crafted with ❤️ by Karim Shoair.</small></div><br>

scrapling-0.2.9.dist-info/RECORD ADDED
@@ -0,0 +1,47 @@
+ scrapling/__init__.py,sha256=4adit4xM1Io6mBz-VnnSHcPCQxIYhvDmDVMhbXu8VF4,499
+ scrapling/defaults.py,sha256=tJAOMB-PMd3aLZz3j_yr6haBxxaklAvWdS_hP-GFFdU,331
+ scrapling/fetchers.py,sha256=I_N32DMjCzNCMmrkGYoX480x1Eh5Lka6cMJ-EcSfszk,35342
+ scrapling/parser.py,sha256=NKwOsGR6TB7XC9lMkA418_DRWE6pyUqK0XtmTAA51ic,55215
+ scrapling/py.typed,sha256=frcCV1k9oG9oKj3dpUqdJg1PxRT2RSN_XKdLCPjaYaY,2
+ scrapling/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ scrapling/core/_types.py,sha256=__HJ2JTk5vx5eg_7HAJmDjaHrMDIaoxNG8fadLLyKV8,566
+ scrapling/core/custom_types.py,sha256=ZRzpoT6qQ4vU_ejhLXa7WYuYLGl5HwAjLPe01xdhuvM,10808
+ scrapling/core/mixins.py,sha256=sozbpaGL1_O_x3U-ABM5aYWpnxpCLfdbcA9SG3P7weY,3532
+ scrapling/core/storage_adaptors.py,sha256=l_ZYcdn1y69AcoPuRrPoaxqKysN62pMExrwJWYdu5MA,6220
+ scrapling/core/translator.py,sha256=ojDmNi5pFZE6Ke-AiSsTilXiPRdR8yhX3o-uVGMkap8,5236
+ scrapling/core/utils.py,sha256=03LzCDzmeK1TXPjIKVzHSUgSfhpe36XE8AwxlgxzJoU,3705
+ scrapling/engines/__init__.py,sha256=zA7tzqcDXP0hllwmjVewNHWipIA4JSU9mRG4J-cud0c,267
+ scrapling/engines/camo.py,sha256=L5jRNUgJSAY5hE8KCD-tz4SFrx7ZjowJoWpHrl7havI,12359
+ scrapling/engines/constants.py,sha256=Gb_nXFoBB4ujJkd05SKkenMe1UDiRYQA3dkmA3DunLg,3723
+ scrapling/engines/pw.py,sha256=0vCDaodve_WcOdbGqBdyRwMECPZmQ0eGLQikh4WHKFc,17011
+ scrapling/engines/static.py,sha256=7SVEfeigCPfwC1ukx0zIFFe96Bo5fox6qOq2IWrP6P8,10319
+ scrapling/engines/toolbelt/__init__.py,sha256=VQDdYm1zY9Apno6d8UrULk29vUjllZrQqD8mXL1E2Fc,402
+ scrapling/engines/toolbelt/custom.py,sha256=FbWTUC0Z8NTmTLFDiiCchs4W0_Q40lz2ONnhInRNuvA,12947
+ scrapling/engines/toolbelt/fingerprints.py,sha256=ajEHdXHr7W4hw9KcNS7XlyxNBZu37p1bRj18TiICLzU,2929
+ scrapling/engines/toolbelt/navigation.py,sha256=xEfZRJefuxOCGxQOSI2llS0du0Y2XmoIPdVGUSHOd7k,4567
+ scrapling/engines/toolbelt/bypasses/navigator_plugins.js,sha256=tbnnk3nCXB6QEQnOhDlu3n-s7lnUTAkrUsjP6FDQIQg,2104
+ scrapling/engines/toolbelt/bypasses/notification_permission.js,sha256=poPM3o5WYgEX-EdiUfDCllpWfc3Umvw4jr2u6O6elus,237
+ scrapling/engines/toolbelt/bypasses/pdf_viewer.js,sha256=mKjjSuP1-BOGC_2WhRYHJo_LP7lTBi2KXmP_zsHO_tI,173
+ scrapling/engines/toolbelt/bypasses/playwright_fingerprint.js,sha256=3RP1AE_XZRvpupeV_i-WSNVqRxyUy0qd8rQV8j_4j3U,221
+ scrapling/engines/toolbelt/bypasses/screen_props.js,sha256=fZEuHMQ1-fYuxxUMoQXUvVWYUkPUbblkfMfpiLvBY7w,599
+ scrapling/engines/toolbelt/bypasses/webdriver_fully.js,sha256=hdJw4clRAJQqIdq5gIFC_eC-x7C1i2ab01KV5ylmOBs,728
+ scrapling/engines/toolbelt/bypasses/window_chrome.js,sha256=D7hqzNGGDorh8JVlvm2YIv7Bk2CoVkG55MDIdyqhT1w,6808
+ tests/__init__.py,sha256=YHFB5ftzgLQVh6gbPfbYcY4yOS9DOBp5dBa6I-qtm8U,32
+ tests/fetchers/__init__.py,sha256=6H4NgARhyTcGGd3dNCKQJ8kUFdrAEMSScQL7Ga_vU3c,43
+ tests/fetchers/test_utils.py,sha256=ANFu-4FFhtyGFGIwJksUO2M2tTTcKU2M_t6F2aav8lM,4967
+ tests/fetchers/async/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/fetchers/async/test_camoufox.py,sha256=BANJ0TVqEdsjkYlsyU-q_spfaMsqTLOBQU8LUDurL9I,3685
+ tests/fetchers/async/test_httpx.py,sha256=6WgsvqV1-rYTjZ9na5x-wt49C3Ur9D99HXBFbewO0gc,3888
+ tests/fetchers/async/test_playwright.py,sha256=zzSYnfRksjNep_YipTiYAB9eQaIo3fssKLrsGzXEakw,4068
+ tests/fetchers/sync/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/fetchers/sync/test_camoufox.py,sha256=IcDXPAWSSJnYT6psDFKSbCeym5n7hCrMPYQEghaOX3A,3165
+ tests/fetchers/sync/test_httpx.py,sha256=xItYWjnDOIswKJzua2tDq8Oy43nTeFl0O1bci7lzGmg,3615
+ tests/fetchers/sync/test_playwright.py,sha256=5eZdPwk3JGeaO7GuExv_QsByLyWDE9joxnmprW0WO6Q,3780
+ tests/parser/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+ tests/parser/test_automatch.py,sha256=SxsNdExE8zz8AcPRQFBUjZ3Q_1-tPOd9dzVvMSZpOYQ,4908
+ tests/parser/test_general.py,sha256=dyfOsc8lleoY4AxcfDUBUaD1i95xecfYuTUhKBsYjwo,12100
+ scrapling-0.2.9.dist-info/LICENSE,sha256=XHgu8DRuT7_g3Hb9Q18YGg8eShp6axPBacbnQxT_WWQ,1499
+ scrapling-0.2.9.dist-info/METADATA,sha256=Wg6lcRo_5LcyotrB1ZXagT5-gToAyRmtNKsq6TJoNk4,68382
+ scrapling-0.2.9.dist-info/WHEEL,sha256=PZUExdf71Ui_so67QXpySuHtCi3-J3wvF4ORK6k_S8U,91
+ scrapling-0.2.9.dist-info/top_level.txt,sha256=ub7FkOEXeYmmYTUxd4pCrwXfBfAMIpZ1sCGmXCc14tI,16
+ scrapling-0.2.9.dist-info/RECORD,,
File without changes

tests/fetchers/async/test_camoufox.py ADDED
@@ -0,0 +1,95 @@
+ import pytest
+ import pytest_httpbin
+
+ from scrapling import StealthyFetcher
+
+
+ @pytest_httpbin.use_class_based_httpbin
+ @pytest.mark.asyncio
+ class TestStealthyFetcher:
+     @pytest.fixture(scope="class")
+     def fetcher(self):
+         return StealthyFetcher(auto_match=False)
+
+     @pytest.fixture(scope="class")
+     def urls(self, httpbin):
+         url = httpbin.url
+         return {
+             'status_200': f'{url}/status/200',
+             'status_404': f'{url}/status/404',
+             'status_501': f'{url}/status/501',
+             'basic_url': f'{url}/get',
+             'html_url': f'{url}/html',
+             'delayed_url': f'{url}/delay/10', # 10 Seconds delay response
+             'cookies_url': f"{url}/cookies/set/test/value"
+         }
+
+     async def test_basic_fetch(self, fetcher, urls):
+         """Test doing basic fetch request with multiple statuses"""
+         assert (await fetcher.async_fetch(urls['status_200'])).status == 200
+         assert (await fetcher.async_fetch(urls['status_404'])).status == 404
+         assert (await fetcher.async_fetch(urls['status_501'])).status == 501
+
+     async def test_networkidle(self, fetcher, urls):
+         """Test if waiting for `networkidle` make page does not finish loading or not"""
+         assert (await fetcher.async_fetch(urls['basic_url'], network_idle=True)).status == 200
+
+     async def test_blocking_resources(self, fetcher, urls):
+         """Test if blocking resources make page does not finish loading or not"""
+         assert (await fetcher.async_fetch(urls['basic_url'], block_images=True)).status == 200
+         assert (await fetcher.async_fetch(urls['basic_url'], disable_resources=True)).status == 200
+
+     async def test_waiting_selector(self, fetcher, urls):
+         """Test if waiting for a selector make page does not finish loading or not"""
+         assert (await fetcher.async_fetch(urls['html_url'], wait_selector='h1')).status == 200
+         assert (await fetcher.async_fetch(
+             urls['html_url'],
+             wait_selector='h1',
+             wait_selector_state='visible'
+         )).status == 200
+
+     async def test_cookies_loading(self, fetcher, urls):
+         """Test if cookies are set after the request"""
+         response = await fetcher.async_fetch(urls['cookies_url'])
+         assert response.cookies == {'test': 'value'}
+
+     async def test_automation(self, fetcher, urls):
+         """Test if automation break the code or not"""
+
+         async def scroll_page(page):
+             await page.mouse.wheel(10, 0)
+             await page.mouse.move(100, 400)
+             await page.mouse.up()
+             return page
+
+         assert (await fetcher.async_fetch(urls['html_url'], page_action=scroll_page)).status == 200
+
+     async def test_properties(self, fetcher, urls):
+         """Test if different arguments breaks the code or not"""
+         assert (await fetcher.async_fetch(
+             urls['html_url'],
+             block_webrtc=True,
+             allow_webgl=True
+         )).status == 200
+
+         assert (await fetcher.async_fetch(
+             urls['html_url'],
+             block_webrtc=False,
+             allow_webgl=True
+         )).status == 200
+
+         assert (await fetcher.async_fetch(
+             urls['html_url'],
+             block_webrtc=True,
+             allow_webgl=False
+         )).status == 200
+
+         assert (await fetcher.async_fetch(
+             urls['html_url'],
+             extra_headers={'ayo': ''},
+             os_randomize=True
+         )).status == 200
+
+     async def test_infinite_timeout(self, fetcher, urls):
+         """Test if infinite timeout breaks the code or not"""
+         assert (await fetcher.async_fetch(urls['delayed_url'], timeout=None)).status == 200