scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. scrapling/__init__.py +4 -3
  2. scrapling/core/__init__.py +0 -0
  3. scrapling/core/_types.py +25 -0
  4. scrapling/{custom_types.py → core/custom_types.py} +48 -3
  5. scrapling/{mixins.py → core/mixins.py} +22 -7
  6. scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
  7. scrapling/{translator.py → core/translator.py} +2 -12
  8. scrapling/{utils.py → core/utils.py} +14 -61
  9. scrapling/engines/__init__.py +7 -0
  10. scrapling/engines/camo.py +128 -0
  11. scrapling/engines/constants.py +108 -0
  12. scrapling/engines/pw.py +237 -0
  13. scrapling/engines/static.py +112 -0
  14. scrapling/engines/toolbelt/__init__.py +19 -0
  15. scrapling/engines/toolbelt/custom.py +154 -0
  16. scrapling/engines/toolbelt/fingerprints.py +81 -0
  17. scrapling/engines/toolbelt/navigation.py +108 -0
  18. scrapling/fetchers.py +198 -0
  19. scrapling/parser.py +223 -70
  20. scrapling/py.typed +1 -0
  21. scrapling-0.2.1.dist-info/METADATA +835 -0
  22. scrapling-0.2.1.dist-info/RECORD +33 -0
  23. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
  24. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
  25. tests/__init__.py +1 -0
  26. tests/fetchers/__init__.py +1 -0
  27. tests/fetchers/test_camoufox.py +62 -0
  28. tests/fetchers/test_httpx.py +67 -0
  29. tests/fetchers/test_playwright.py +74 -0
  30. tests/parser/__init__.py +0 -0
  31. tests/parser/test_automatch.py +56 -0
  32. tests/parser/test_general.py +286 -0
  33. scrapling-0.1.2.dist-info/METADATA +0 -477
  34. scrapling-0.1.2.dist-info/RECORD +0 -12
  35. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/parser.py CHANGED
@@ -1,18 +1,14 @@
  import os
+ import re
+ import inspect
  from difflib import SequenceMatcher
- from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
- try:
-     from typing import SupportsIndex
- except ImportError:
-     # 'SupportsIndex' got added in Python 3.8
-     SupportsIndex = None
-
- from scrapling.translator import HTMLTranslator
- from scrapling.mixins import SelectorsGeneration
- from scrapling.custom_types import TextHandler, AttributesHandler
- from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
- from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden

+ from scrapling.core.translator import HTMLTranslator
+ from scrapling.core.mixins import SelectorsGeneration
+ from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
+ from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
+ from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
+ from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
  from lxml import etree, html
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors

@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
              huge_tree: bool = True,
              root: Optional[html.HtmlElement] = None,
              keep_comments: Optional[bool] = False,
-             auto_match: Optional[bool] = False,
+             auto_match: Optional[bool] = True,
              storage: Any = SQLiteStorageSystem,
              storage_args: Optional[Dict] = None,
              debug: Optional[bool] = True,
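
For orientation, a minimal usage sketch of the new default (not part of the diff; the HTML string, URL, and variable names are invented, and it assumes scrapling 0.2.1 is installed). As the next hunk shows, the constructor needs text, body, or root:

    from scrapling.parser import Adaptor

    # Hypothetical page snippet, purely for illustration
    html_source = '<html><body><p class="intro">Hello</p></body></html>'

    # auto_match now defaults to True, so pass auto_match=False to opt out explicitly
    page = Adaptor(text=html_source, url='https://example.com', auto_match=False)
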
@@ -64,6 +60,7 @@ class Adaptor(SelectorsGeneration):
          if root is None and not body and text is None:
              raise ValueError("Adaptor class needs text, body, or root arguments to work")

+         self.__text = None
          if root is None:
              if text is None:
                  if not body or not isinstance(body, bytes):
@@ -76,12 +73,14 @@ class Adaptor(SelectorsGeneration):
                  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"

+             # https://lxml.de/api/lxml.etree.HTMLParser-class.html
              parser = html.HTMLParser(
-                 # https://lxml.de/api/lxml.etree.HTMLParser-class.html
                  recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
                  compact=True, huge_tree=huge_tree, default_doctype=True
              )
              self._root = etree.fromstring(body, parser=parser, base_url=url)
+             if is_jsonable(text or body.decode()):
+                 self.__text = TextHandler(text or body.decode())

          else:
              # All html types inherits from HtmlMixin so this to check for all at once
@@ -116,7 +115,6 @@ class Adaptor(SelectorsGeneration):
          self.url = url
          # For selector stuff
          self.__attributes = None
-         self.__text = None
          self.__tag = None
          self.__debug = debug

@@ -125,7 +123,7 @@ class Adaptor(SelectorsGeneration):
      def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
          """Return True if given element is a result of a string expression
          Examples:
-             Xpath -> '/text()', '/@attribute' etc...
+             XPath -> '/text()', '/@attribute' etc...
              CSS3 -> '::text', '::attr(attrib)'...
          """
          # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
@@ -163,6 +161,8 @@ class Adaptor(SelectorsGeneration):
              results = [self.__get_correct_result(n) for n in result]
              if all(isinstance(res, self.__class__) for res in results):
                  return Adaptors(results)
+             elif all(isinstance(res, TextHandler) for res in results):
+                 return TextHandlers(results)
              return results

          return self.__get_correct_result(result)
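
For context only (not part of the diff): with this change, selecting text nodes should come back wrapped in the new TextHandlers container instead of a plain list. A hedged sketch with an invented HTML snippet:

    from scrapling.parser import Adaptor

    page = Adaptor(text='<ul><li>a</li><li>b</li></ul>', auto_match=False)

    # '::text' results are expected to be returned as TextHandlers now
    items = page.css('li::text')
    print([str(item) for item in items])  # expected: ['a', 'b']
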
@@ -187,23 +187,9 @@ class Adaptor(SelectorsGeneration):
      def text(self) -> TextHandler:
          """Get text content of the element"""
          if not self.__text:
-             if self.__keep_comments:
-                 if not self.children:
-                     # If use chose to keep comments, remove comments from text
-                     # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
-                     # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
-                     code = self.html_content
-                     parser = html.HTMLParser(
-                         recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
-                         compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
-                     )
-                     fragment_root = html.fragment_fromstring(code, parser=parser)
-                     self.__text = TextHandler(fragment_root.text)
-                 else:
-                     self.__text = TextHandler(self._root.text)
-             else:
-                 # If user already chose to not keep comments then all is good
-                 self.__text = TextHandler(self._root.text)
+             # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+             # before extracting text then keep `keep_comments` set to False while initializing the first class
+             self.__text = TextHandler(self._root.text)
          return self.__text

      def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
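
To make the comment above concrete, a hedged illustration (HTML snippet invented; the expected output assumes the parser really drops the comment when keep_comments is left at its False default):

    from scrapling.parser import Adaptor

    snippet = '<span>CONDITION: <!-- -->Excellent</span>'

    # keep_comments defaults to False, so the comment node is removed at parse time
    page = Adaptor(text=snippet, auto_match=False)
    print(page.css('span')[0].text)  # expected: 'CONDITION: Excellent'
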
@@ -399,6 +385,56 @@ class Adaptor(SelectorsGeneration):
              return self.__convert_results(score_table[highest_probability])
          return []

+     def css_first(self, selector: str, identifier: str = '',
+                   auto_match: bool = False, auto_save: bool = False, percentage: int = 0
+                   ) -> Union['Adaptor', 'TextHandler', None]:
+         """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
+
+         **Important:
+         It's recommended to use the identifier argument if you plan to use different selector later
+         and want to relocate the same element(s)**
+
+         :param selector: The CSS3 selector to be used.
+         :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+         :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+          otherwise the selector will be used.
+         :param auto_save: Automatically save new elements for `auto_match` later
+         :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+          Be aware that the percentage calculation depends solely on the page structure so don't play with this
+          number unless you must know what you are doing!
+
+         :return: The first matching element as :class:`Adaptor` or :class:`TextHandler`, otherwise `None`
+         """
+         for element in self.css(selector, identifier, auto_match, auto_save, percentage):
+             return element
+         return None
+
+     def xpath_first(self, selector: str, identifier: str = '',
+                     auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
+                     ) -> Union['Adaptor', 'TextHandler', None]:
+         """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
+
+         **Important:
+         It's recommended to use the identifier argument if you plan to use different selector later
+         and want to relocate the same element(s)**
+
+         Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
+
+         :param selector: The XPath selector to be used.
+         :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+         :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+          otherwise the selector will be used.
+         :param auto_save: Automatically save new elements for `auto_match` later
+         :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+          Be aware that the percentage calculation depends solely on the page structure so don't play with this
+          number unless you must know what you are doing!
+
+         :return: The first matching element as :class:`Adaptor` or :class:`TextHandler`, otherwise `None`
+         """
+         for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
+             return element
+         return None
+
      def css(self, selector: str, identifier: str = '',
              auto_match: bool = False, auto_save: bool = False, percentage: int = 0
              ) -> Union['Adaptors[Adaptor]', List]:
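
A hedged usage sketch of the new convenience methods above (not part of the diff; HTML, selectors, and variable names are invented, and the XPath-variable behaviour is taken from the docstring):

    from scrapling.parser import Adaptor

    page = Adaptor(
        text='<div><a href="/p/1">First</a><a href="/p/2">Second</a></div>',
        auto_match=False,
    )

    first_link = page.css_first('a')   # first <a> as an Adaptor, or None if nothing matched
    print(first_link.text)             # expected: 'First'

    # Extra keyword arguments are documented to become XPath variables
    second = page.xpath_first('//a[@href=$target]', target='/p/2')
    print(second.text)                 # expected: 'Second'
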
@@ -495,6 +531,113 @@ class Adaptor(SelectorsGeneration):
          except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
              raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")

+     def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
+         """Find elements by filters of your creations for ease..
+
+         :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+         :param kwargs: The attributes you want to filter elements based on it.
+         :return: The `Adaptors` object of the elements or empty list
+         """
+         # Attributes that are Python reserved words and can't be used directly
+         # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
+         # https://www.w3schools.com/python/python_ref_keywords.asp
+         whitelisted = {
+             'class_': 'class',
+             'for_': 'for',
+         }
+
+         if not args and not kwargs:
+             raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
+
+         attributes = dict()
+         tags, patterns = set(), set()
+         results, functions, selectors = [], [], []
+
+         def _search_tree(element: Adaptor, filter_function: Callable) -> None:
+             """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
+             if filter_function(element):
+                 results.append(element)
+
+             for branch in element.children:
+                 _search_tree(branch, filter_function)
+
+         # Brace yourself for a wonderful journey!
+         for arg in args:
+             if type(arg) is str:
+                 tags.add(arg)
+
+             elif type(arg) in [list, tuple, set]:
+                 if not all(map(lambda x: type(x) is str, arg)):
+                     raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
+                 tags.update(set(arg))
+
+             elif type(arg) is dict:
+                 if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
+                     raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
+                 attributes.update(arg)
+
+             elif type(arg) is re.Pattern:
+                 patterns.add(arg)
+
+             elif callable(arg):
+                 if len(inspect.signature(arg).parameters) > 0:
+                     functions.append(arg)
+                 else:
+                     raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
+
+             else:
+                 raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
+
+         if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
+             raise TypeError('Only string values are accepted for arguments')
+
+         for attribute_name, value in kwargs.items():
+             # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
+             attribute_name = whitelisted.get(attribute_name, attribute_name)
+             attributes[attribute_name] = value
+
+         # It's easier and faster to build a selector than traversing the tree
+         tags = tags or ['']
+         for tag in tags:
+             selector = tag
+             for key, value in attributes.items():
+                 value = value.replace('"', r'\"')  # Escape double quotes in user input
+                 # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
+                 selector += '[{}="{}"]'.format(key, value)
+             if selector:
+                 selectors.append(selector)
+
+         if selectors:
+             results = self.css(', '.join(selectors))
+             if results:
+                 # From the results, get the ones that fulfill passed regex patterns
+                 for pattern in patterns:
+                     results = results.filter(lambda e: e.text.re(pattern, check_match=True))
+
+                 # From the results, get the ones that fulfill passed functions
+                 for function in functions:
+                     results = results.filter(function)
+         else:
+             for pattern in patterns:
+                 results.extend(self.find_by_regex(pattern, first_match=False))
+
+             for result in (results or [self]):
+                 for function in functions:
+                     _search_tree(result, function)
+
+         return self.__convert_results(results)
+
+     def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
+         """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
+
+         :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+         :param kwargs: The attributes you want to filter elements based on it.
+         :return: The `Adaptor` object of the element or `None` if the result didn't match
+         """
+         for element in self.find_all(*args, **kwargs):
+             return element
+         return None
+
      def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
          """Used internally to calculate a score that shows how candidate element similar to the original one
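
A hedged sketch of how the new find_all / find filters combine (not part of the diff; the HTML and variable names are invented):

    import re
    from scrapling.parser import Adaptor

    page = Adaptor(
        text='<div><a class="price" href="/p/1">10 USD</a><span class="price">20 USD</span></div>',
        auto_match=False,
    )

    # Tag names, attribute filters, regex patterns, and callables can be mixed
    prices = page.find_all(['a', 'span'], class_='price')   # class_ is mapped to the 'class' attribute
    usd_links = page.find_all('a', re.compile(r'\d+ USD'))  # the pattern is applied to each element's text
    first = page.find('a', lambda element: 'USD' in element.text)
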
 
@@ -606,25 +749,33 @@ class Adaptor(SelectorsGeneration):
      # Operations on text functions
      def json(self) -> Dict:
          """Return json response if the response is jsonable otherwise throws error"""
-         return self.text.json()
+         if self.text:
+             return self.text.json()
+         else:
+             return self.get_all_text(strip=True).json()

-     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
+     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+            clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
          """Apply the given regex to the current text and return a list of strings with the matches.

          :param regex: Can be either a compiled regular expression or a string.
          :param replace_entities: if enabled character entity references are replaced by their corresponding character
+         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+         :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
          """
-         return self.text.re(regex, replace_entities)
+         return self.text.re(regex, replace_entities, clean_match, case_sensitive)

-     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
+     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                  clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
          """Apply the given regex to text and return the first match if found, otherwise return the default value.

          :param regex: Can be either a compiled regular expression or a string.
          :param default: The default value to be returned if there is no match
          :param replace_entities: if enabled character entity references are replaced by their corresponding character
-
+         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+         :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
          """
-         return self.text.re_first(regex, default, replace_entities)
+         return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)

      def find_similar(
              self,
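
For illustration (not part of the diff; the HTML and patterns are invented, and the whitespace handling of clean_match is only as described in the docstrings above):

    from scrapling.parser import Adaptor

    page = Adaptor(text='<span>Price:   1,299 USD</span>', auto_match=False)
    element = page.css_first('span')

    # clean_match ignores extra whitespace while matching, per the docstring
    print(element.re(r'Price:\s*([\d,]+)', clean_match=True))

    # re_first returns the first match or the given default
    print(element.re_first(r'([\d,]+)\s*USD', default='N/A'))
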
@@ -757,10 +908,10 @@ class Adaptor(SelectorsGeneration):
          return self.__convert_results(results)

      def find_by_regex(
-             self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
+             self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
      ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
          """Find elements that its text content matches the input regex pattern.
-         :param query: Regex query to match
+         :param query: Regex query/pattern to match
          :param first_match: Return first element that matches conditions, enabled by default
          :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
          :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
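
A small hedged sketch of the widened query type (invented HTML; not part of the diff):

    import re
    from scrapling.parser import Adaptor

    page = Adaptor(text='<p>Total: 42 items</p><p>No numbers here</p>', auto_match=False)

    # find_by_regex now also accepts a pre-compiled pattern, not only a string
    match = page.find_by_regex(re.compile(r'\d+ items'), first_match=True)
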
@@ -855,54 +1006,56 @@ class Adaptors(List[Adaptor]):
          ]
          return self.__class__(flatten(results))

-     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
+     def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+            clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
          """Call the ``.re()`` method for each element in this list and return
          their results flattened as List of TextHandler.

          :param regex: Can be either a compiled regular expression or a string.
          :param replace_entities: if enabled character entity references are replaced by their corresponding character
+         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+         :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
          """
          results = [
-             n.text.re(regex, replace_entities) for n in self
+             n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
          ]
          return flatten(results)

-     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
+     def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+                  clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
          """Call the ``.re_first()`` method for each element in this list and return
-         their results flattened as List of TextHandler.
+         the first result or the default value otherwise.

          :param regex: Can be either a compiled regular expression or a string.
          :param default: The default value to be returned if there is no match
          :param replace_entities: if enabled character entity references are replaced by their corresponding character
+         :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+         :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+         """
+         for n in self:
+             for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+                 return result
+         return default
+
+     def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
+         """Loop over all current elements and return the first element that matches the passed function
+         :param func: A function that takes each element as an argument and returns True/False
+         :return: The first element that match the function or ``None`` otherwise.
+         """
+         for element in self:
+             if func(element):
+                 return element
+         return None

+     def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
+         """Filter current elements based on the passed function
+         :param func: A function that takes each element as an argument and returns True/False
+         :return: The new `Adaptors` object or empty list otherwise.
          """
          results = [
-             n.text.re_first(regex, default, replace_entities) for n in self
+             element for element in self if func(element)
          ]
-         return flatten(results)
-
-     # def __getattr__(self, name):
-     #     if name in dir(self.__class__):
-     #         return super().__getattribute__(name)
-     #
-     #     # Execute the method itself on each Adaptor
-     #     results = []
-     #     for item in self:
-     #         results.append(getattr(item, name))
-     #
-     #     if all(callable(r) for r in results):
-     #         def call_all(*args, **kwargs):
-     #             final_results = [r(*args, **kwargs) for r in results]
-     #             if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
-     #                 return self.__class__(final_results)
-     #             return final_results
-     #
-     #         return call_all
-     #     else:
-     #         # Flatten the result if it's a single-item list containing a list
-     #         if len(self) == 1 and isinstance(results[0], list):
-     #             return self.__class__(results[0])
-     #         return self.__class__(results)
+         return self.__class__(results) if results else results

      def get(self, default=None):
          """Returns the first item of the current list
scrapling/py.typed ADDED
@@ -0,0 +1 @@
+