scrapling 0.1.2__py3-none-any.whl → 0.2.1__py3-none-any.whl

Files changed (35)
  1. scrapling/__init__.py +4 -3
  2. scrapling/core/__init__.py +0 -0
  3. scrapling/core/_types.py +25 -0
  4. scrapling/{custom_types.py → core/custom_types.py} +48 -3
  5. scrapling/{mixins.py → core/mixins.py} +22 -7
  6. scrapling/{storage_adaptors.py → core/storage_adaptors.py} +2 -2
  7. scrapling/{translator.py → core/translator.py} +2 -12
  8. scrapling/{utils.py → core/utils.py} +14 -61
  9. scrapling/engines/__init__.py +7 -0
  10. scrapling/engines/camo.py +128 -0
  11. scrapling/engines/constants.py +108 -0
  12. scrapling/engines/pw.py +237 -0
  13. scrapling/engines/static.py +112 -0
  14. scrapling/engines/toolbelt/__init__.py +19 -0
  15. scrapling/engines/toolbelt/custom.py +154 -0
  16. scrapling/engines/toolbelt/fingerprints.py +81 -0
  17. scrapling/engines/toolbelt/navigation.py +108 -0
  18. scrapling/fetchers.py +198 -0
  19. scrapling/parser.py +223 -70
  20. scrapling/py.typed +1 -0
  21. scrapling-0.2.1.dist-info/METADATA +835 -0
  22. scrapling-0.2.1.dist-info/RECORD +33 -0
  23. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/WHEEL +1 -1
  24. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/top_level.txt +1 -0
  25. tests/__init__.py +1 -0
  26. tests/fetchers/__init__.py +1 -0
  27. tests/fetchers/test_camoufox.py +62 -0
  28. tests/fetchers/test_httpx.py +67 -0
  29. tests/fetchers/test_playwright.py +74 -0
  30. tests/parser/__init__.py +0 -0
  31. tests/parser/test_automatch.py +56 -0
  32. tests/parser/test_general.py +286 -0
  33. scrapling-0.1.2.dist-info/METADATA +0 -477
  34. scrapling-0.1.2.dist-info/RECORD +0 -12
  35. {scrapling-0.1.2.dist-info → scrapling-0.2.1.dist-info}/LICENSE +0 -0
scrapling/parser.py CHANGED
@@ -1,18 +1,14 @@
  import os
+ import re
+ import inspect
  from difflib import SequenceMatcher
- from typing import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator
- try:
- from typing import SupportsIndex
- except ImportError:
- # 'SupportsIndex' got added in Python 3.8
- SupportsIndex = None
-
- from scrapling.translator import HTMLTranslator
- from scrapling.mixins import SelectorsGeneration
- from scrapling.custom_types import TextHandler, AttributesHandler
- from scrapling.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
- from scrapling.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden
 
+ from scrapling.core.translator import HTMLTranslator
+ from scrapling.core.mixins import SelectorsGeneration
+ from scrapling.core.custom_types import TextHandler, TextHandlers, AttributesHandler
+ from scrapling.core.storage_adaptors import SQLiteStorageSystem, StorageSystemMixin, _StorageTools
+ from scrapling.core.utils import setup_basic_logging, logging, clean_spaces, flatten, html_forbidden, is_jsonable
+ from scrapling.core._types import Any, Dict, List, Tuple, Optional, Pattern, Union, Callable, Generator, SupportsIndex, Iterable
  from lxml import etree, html
  from cssselect import SelectorError, SelectorSyntaxError, parse as split_selectors
 
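The hunk above moves the parser's helper modules into a new `scrapling.core` subpackage and pulls all typing names from `scrapling.core._types`. A minimal, illustrative sketch of how downstream imports would change after upgrading, assuming the modules shown in this hunk and in the file list are importable from the installed 0.2.1 wheel:

    # Illustrative only: the flat 0.1.2 modules now live under scrapling.core
    from scrapling.parser import Adaptor                     # module path unchanged
    from scrapling.core.custom_types import TextHandler      # was scrapling.custom_types
    from scrapling.core.utils import flatten, clean_spaces   # was scrapling.utils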
@@ -32,7 +28,7 @@ class Adaptor(SelectorsGeneration):
  huge_tree: bool = True,
  root: Optional[html.HtmlElement] = None,
  keep_comments: Optional[bool] = False,
- auto_match: Optional[bool] = False,
+ auto_match: Optional[bool] = True,
  storage: Any = SQLiteStorageSystem,
  storage_args: Optional[Dict] = None,
  debug: Optional[bool] = True,
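Note the behavioural change in this hunk: `auto_match` now defaults to True, so element auto-matching is opt-out rather than opt-in. A hedged sketch of opting out, assuming the constructor keywords shown in this and the neighbouring hunks (`text`, `url`, `auto_match`, `debug`):

    from scrapling.parser import Adaptor

    html_doc = "<html><body><p id='price'>15.5</p></body></html>"
    # Auto-matching is now on by default; pass auto_match=False explicitly to keep the 0.1.2 behaviour
    page = Adaptor(text=html_doc, url='https://example.com', auto_match=False, debug=False)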
@@ -64,6 +60,7 @@ class Adaptor(SelectorsGeneration):
  if root is None and not body and text is None:
  raise ValueError("Adaptor class needs text, body, or root arguments to work")
 
+ self.__text = None
  if root is None:
  if text is None:
  if not body or not isinstance(body, bytes):
@@ -76,12 +73,14 @@ class Adaptor(SelectorsGeneration):
 
  body = text.strip().replace("\x00", "").encode(encoding) or b"<html/>"
 
+ # https://lxml.de/api/lxml.etree.HTMLParser-class.html
  parser = html.HTMLParser(
- # https://lxml.de/api/lxml.etree.HTMLParser-class.html
  recover=True, remove_blank_text=True, remove_comments=(keep_comments is False), encoding=encoding,
  compact=True, huge_tree=huge_tree, default_doctype=True
  )
  self._root = etree.fromstring(body, parser=parser, base_url=url)
+ if is_jsonable(text or body.decode()):
+ self.__text = TextHandler(text or body.decode())
 
  else:
  # All html types inherits from HtmlMixin so this to check for all at once
@@ -116,7 +115,6 @@ class Adaptor(SelectorsGeneration):
  self.url = url
  # For selector stuff
  self.__attributes = None
- self.__text = None
  self.__tag = None
  self.__debug = debug
 
@@ -125,7 +123,7 @@ class Adaptor(SelectorsGeneration):
  def _is_text_node(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> bool:
  """Return True if given element is a result of a string expression
  Examples:
- Xpath -> '/text()', '/@attribute' etc...
+ XPath -> '/text()', '/@attribute' etc...
  CSS3 -> '::text', '::attr(attrib)'...
  """
  # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
@@ -163,6 +161,8 @@ class Adaptor(SelectorsGeneration):
  results = [self.__get_correct_result(n) for n in result]
  if all(isinstance(res, self.__class__) for res in results):
  return Adaptors(results)
+ elif all(isinstance(res, TextHandler) for res in results):
+ return TextHandlers(results)
  return results
 
  return self.__get_correct_result(result)
@@ -187,23 +187,9 @@ class Adaptor(SelectorsGeneration):
  def text(self) -> TextHandler:
  """Get text content of the element"""
  if not self.__text:
- if self.__keep_comments:
- if not self.children:
- # If use chose to keep comments, remove comments from text
- # Escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
- # This issue is present in parsel/scrapy as well so no need to repeat it here so the user can run regex on the full text.
- code = self.html_content
- parser = html.HTMLParser(
- recover=True, remove_blank_text=True, remove_comments=True, encoding=self.encoding,
- compact=True, huge_tree=self.__huge_tree_enabled, default_doctype=True
- )
- fragment_root = html.fragment_fromstring(code, parser=parser)
- self.__text = TextHandler(fragment_root.text)
- else:
- self.__text = TextHandler(self._root.text)
- else:
- # If user already chose to not keep comments then all is good
- self.__text = TextHandler(self._root.text)
+ # If you want to escape lxml default behaviour and remove comments like this `<span>CONDITION: <!-- -->Excellent</span>`
+ # before extracting text then keep `keep_comments` set to False while initializing the first class
+ self.__text = TextHandler(self._root.text)
  return self.__text
 
  def get_all_text(self, separator: str = "\n", strip: bool = False, ignore_tags: Tuple = ('script', 'style',), valid_values: bool = True) -> TextHandler:
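The simplified `text` property above no longer re-parses the fragment to strip comments; that decision now happens once at parse time via `keep_comments` (the earlier HTMLParser hunk sets `remove_comments=(keep_comments is False)`). An illustrative sketch of the difference; the expected outputs are assumptions based on the behaviour described in the removed comments:

    from scrapling.parser import Adaptor

    snippet = "<span>CONDITION: <!-- -->Excellent</span>"

    # keep_comments=False (the default): the comment is dropped at parse time, so the
    # span's text is expected to come out contiguous, e.g. "CONDITION: Excellent"
    clean = Adaptor(text=snippet, keep_comments=False, auto_match=False, debug=False)
    clean.css_first('span').text

    # keep_comments=True: lxml keeps the comment node, so `.text` is expected to stop
    # at it ("CONDITION: "), which is what the removed re-parsing branch used to work around
    kept = Adaptor(text=snippet, keep_comments=True, auto_match=False, debug=False)
    kept.css_first('span').text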
@@ -399,6 +385,56 @@ class Adaptor(SelectorsGeneration):
  return self.__convert_results(score_table[highest_probability])
  return []
 
+ def css_first(self, selector: str, identifier: str = '',
+ auto_match: bool = False, auto_save: bool = False, percentage: int = 0
+ ) -> Union['Adaptor', 'TextHandler', None]:
+ """Search current tree with CSS3 selectors and return the first result if possible, otherwise return `None`
+
+ **Important:
+ It's recommended to use the identifier argument if you plan to use different selector later
+ and want to relocate the same element(s)**
+
+ :param selector: The CSS3 selector to be used.
+ :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+ otherwise the selector will be used.
+ :param auto_save: Automatically save new elements for `auto_match` later
+ :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+ Be aware that the percentage calculation depends solely on the page structure so don't play with this
+ number unless you must know what you are doing!
+
+ :return: List as :class:`Adaptors`
+ """
+ for element in self.css(selector, identifier, auto_match, auto_save, percentage):
+ return element
+ return None
+
+ def xpath_first(self, selector: str, identifier: str = '',
+ auto_match: bool = False, auto_save: bool = False, percentage: int = 0, **kwargs: Any
+ ) -> Union['Adaptor', 'TextHandler', None]:
+ """Search current tree with XPath selectors and return the first result if possible, otherwise return `None`
+
+ **Important:
+ It's recommended to use the identifier argument if you plan to use different selector later
+ and want to relocate the same element(s)**
+
+ Note: **Additional keyword arguments will be passed as XPath variables in the XPath expression!**
+
+ :param selector: The XPath selector to be used.
+ :param auto_match: Enabled will make function try to relocate the element if it was 'saved' before
+ :param identifier: A string that will be used to save/retrieve element's data in auto-matching
+ otherwise the selector will be used.
+ :param auto_save: Automatically save new elements for `auto_match` later
+ :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
+ Be aware that the percentage calculation depends solely on the page structure so don't play with this
+ number unless you must know what you are doing!
+
+ :return: List as :class:`Adaptors`
+ """
+ for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
+ return element
+ return None
+
  def css(self, selector: str, identifier: str = '',
  auto_match: bool = False, auto_save: bool = False, percentage: int = 0
  ) -> Union['Adaptors[Adaptor]', List]:
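A short, illustrative usage sketch for the new `css_first`/`xpath_first` helpers added above; both simply return the first result of `css`/`xpath` or `None`. The markup and URL are made up for the example:

    from scrapling.parser import Adaptor

    page = Adaptor(
        text="<html><body><a class='link' href='/p/1'>First</a><a class='link' href='/p/2'>Second</a></body></html>",
        url='https://example.com', auto_match=False, debug=False,
    )

    first_link = page.css_first('a.link')       # Adaptor for the first matching <a>, or None when nothing matches
    missing = page.css_first('table.prices')    # None instead of an empty result list
    first_href = page.xpath_first('//a/@href')  # attribute/text results come back as a TextHandler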
@@ -495,6 +531,113 @@ class Adaptor(SelectorsGeneration):
  except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
  raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
 
+ def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
+ """Find elements by filters of your creations for ease..
+
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+ :param kwargs: The attributes you want to filter elements based on it.
+ :return: The `Adaptors` object of the elements or empty list
+ """
+ # Attributes that are Python reserved words and can't be used directly
+ # Ex: find_all('a', class="blah") -> find_all('a', class_="blah")
+ # https://www.w3schools.com/python/python_ref_keywords.asp
+ whitelisted = {
+ 'class_': 'class',
+ 'for_': 'for',
+ }
+
+ if not args and not kwargs:
+ raise TypeError('You have to pass something to search with, like tag name(s), tag attributes, or both.')
+
+ attributes = dict()
+ tags, patterns = set(), set()
+ results, functions, selectors = [], [], []
+
+ def _search_tree(element: Adaptor, filter_function: Callable) -> None:
+ """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
+ if filter_function(element):
+ results.append(element)
+
+ for branch in element.children:
+ _search_tree(branch, filter_function)
+
+ # Brace yourself for a wonderful journey!
+ for arg in args:
+ if type(arg) is str:
+ tags.add(arg)
+
+ elif type(arg) in [list, tuple, set]:
+ if not all(map(lambda x: type(x) is str, arg)):
+ raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
+ tags.update(set(arg))
+
+ elif type(arg) is dict:
+ if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
+ raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
+ attributes.update(arg)
+
+ elif type(arg) is re.Pattern:
+ patterns.add(arg)
+
+ elif callable(arg):
+ if len(inspect.signature(arg).parameters) > 0:
+ functions.append(arg)
+ else:
+ raise TypeError("Callable filter function must have at least one argument to take `Adaptor` objects.")
+
+ else:
+ raise TypeError(f'Argument with type "{type(arg)}" is not accepted, please read the docs.')
+
+ if not all([(type(k) is str and type(v) is str) for k, v in kwargs.items()]):
+ raise TypeError('Only string values are accepted for arguments')
+
+ for attribute_name, value in kwargs.items():
+ # Only replace names for kwargs, replacing them in dictionaries doesn't make sense
+ attribute_name = whitelisted.get(attribute_name, attribute_name)
+ attributes[attribute_name] = value
+
+ # It's easier and faster to build a selector than traversing the tree
+ tags = tags or ['']
+ for tag in tags:
+ selector = tag
+ for key, value in attributes.items():
+ value = value.replace('"', r'\"') # Escape double quotes in user input
+ # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
+ selector += '[{}="{}"]'.format(key, value)
+ if selector:
+ selectors.append(selector)
+
+ if selectors:
+ results = self.css(', '.join(selectors))
+ if results:
+ # From the results, get the ones that fulfill passed regex patterns
+ for pattern in patterns:
+ results = results.filter(lambda e: e.text.re(pattern, check_match=True))
+
+ # From the results, get the ones that fulfill passed functions
+ for function in functions:
+ results = results.filter(function)
+ else:
+ for pattern in patterns:
+ results.extend(self.find_by_regex(pattern, first_match=False))
+
+ for result in (results or [self]):
+ for function in functions:
+ _search_tree(result, function)
+
+ return self.__convert_results(results)
+
+ def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
+ """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
+
+ :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
+ :param kwargs: The attributes you want to filter elements based on it.
+ :return: The `Adaptor` object of the element or `None` if the result didn't match
+ """
+ for element in self.find_all(*args, **kwargs):
+ return element
+ return None
+
  def __calculate_similarity_score(self, original: Dict, candidate: html.HtmlElement) -> float:
  """Used internally to calculate a score that shows how candidate element similar to the original one
 
@@ -606,25 +749,33 @@ class Adaptor(SelectorsGeneration):
  # Operations on text functions
  def json(self) -> Dict:
  """Return json response if the response is jsonable otherwise throws error"""
- return self.text.json()
+ if self.text:
+ return self.text.json()
+ else:
+ return self.get_all_text(strip=True).json()
 
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
+ def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+ clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
  """Apply the given regex to the current text and return a list of strings with the matches.
 
  :param regex: Can be either a compiled regular expression or a string.
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+ :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
  """
- return self.text.re(regex, replace_entities)
+ return self.text.re(regex, replace_entities, clean_match, case_sensitive)
 
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
+ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
  """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
  :param regex: Can be either a compiled regular expression or a string.
  :param default: The default value to be returned if there is no match
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
-
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+ :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
  """
- return self.text.re_first(regex, default, replace_entities)
+ return self.text.re_first(regex, default, replace_entities, clean_match, case_sensitive)
 
  def find_similar(
  self,
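`Adaptor.re`/`re_first` above now just forward the new `clean_match` and `case_sensitive` flags to the underlying `TextHandler` in `scrapling/core/custom_types.py`, and `json()` falls back to the stripped full text when the element carries no cached text. A brief illustrative sketch; the exact matching semantics of the two flags live in the `TextHandler` implementation, which is outside this hunk:

    from scrapling.parser import Adaptor

    el = Adaptor(text='<p>Price:   15.5 USD</p>', auto_match=False, debug=False).css_first('p')

    el.re(r'\d+\.\d+')                          # list of matches over the element's text, as before
    el.re_first(r'\d+\.\d+', clean_match=True)  # per the docstring, whitespace is ignored while matching
    el.re_first(r'price', default='n/a', case_sensitive=True)  # forwarded unchanged to TextHandler.re()

    # json() plus the is_jsonable() check added in the constructor hunk means a raw JSON
    # body can be decoded straight from the element's cached text
    Adaptor(text='{"ok": true}', auto_match=False, debug=False).json()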
@@ -757,10 +908,10 @@ class Adaptor(SelectorsGeneration):
  return self.__convert_results(results)
 
  def find_by_regex(
- self, query: str, first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
+ self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
  ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
  """Find elements that its text content matches the input regex pattern.
- :param query: Regex query to match
+ :param query: Regex query/pattern to match
  :param first_match: Return first element that matches conditions, enabled by default
  :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
  :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
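`find_by_regex` above now accepts a pre-compiled `Pattern` as well as a plain string, which is also what lets `find_all` hand its regex arguments straight through to it. A one-line illustrative sketch:

    import re
    from scrapling.parser import Adaptor

    page = Adaptor(text='<div><b>3 views</b><b>12 views</b></div>', auto_match=False, debug=False)
    first = page.find_by_regex(re.compile(r'\d+ views'), first_match=True)  # plain string queries still work too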
@@ -855,54 +1006,56 @@ class Adaptors(List[Adaptor]):
  ]
  return self.__class__(flatten(results))
 
- def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True) -> 'List[str]':
+ def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
+ clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
  """Call the ``.re()`` method for each element in this list and return
  their results flattened as List of TextHandler.
 
  :param regex: Can be either a compiled regular expression or a string.
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+ :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
  """
  results = [
- n.text.re(regex, replace_entities) for n in self
+ n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
  ]
  return flatten(results)
 
- def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True):
+ def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
+ clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
  """Call the ``.re_first()`` method for each element in this list and return
- their results flattened as List of TextHandler.
+ the first result or the default value otherwise.
 
  :param regex: Can be either a compiled regular expression or a string.
  :param default: The default value to be returned if there is no match
  :param replace_entities: if enabled character entity references are replaced by their corresponding character
+ :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
+ :param case_sensitive: if enabled, function will set the regex to ignore letters case while compiling it
+ """
+ for n in self:
+ for result in n.re(regex, replace_entities, clean_match, case_sensitive):
+ return result
+ return default
+
+ def search(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
+ """Loop over all current elements and return the first element that matches the passed function
+ :param func: A function that takes each element as an argument and returns True/False
+ :return: The first element that match the function or ``None`` otherwise.
+ """
+ for element in self:
+ if func(element):
+ return element
+ return None
 
+ def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
+ """Filter current elements based on the passed function
+ :param func: A function that takes each element as an argument and returns True/False
+ :return: The new `Adaptors` object or empty list otherwise.
  """
  results = [
- n.text.re_first(regex, default, replace_entities) for n in self
+ element for element in self if func(element)
  ]
- return flatten(results)
-
- # def __getattr__(self, name):
- # if name in dir(self.__class__):
- # return super().__getattribute__(name)
- #
- # # Execute the method itself on each Adaptor
- # results = []
- # for item in self:
- # results.append(getattr(item, name))
- #
- # if all(callable(r) for r in results):
- # def call_all(*args, **kwargs):
- # final_results = [r(*args, **kwargs) for r in results]
- # if all([isinstance(r, (Adaptor, Adaptors,)) for r in results]):
- # return self.__class__(final_results)
- # return final_results
- #
- # return call_all
- # else:
- # # Flatten the result if it's a single-item list containing a list
- # if len(self) == 1 and isinstance(results[0], list):
- # return self.__class__(results[0])
- # return self.__class__(results)
+ return self.__class__(results) if results else results
 
  def get(self, default=None):
  """Returns the first item of the current list
scrapling/py.typed ADDED
@@ -0,0 +1 @@
+
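Closing note on the `Adaptors` changes in the `scrapling/parser.py` diff above: `re_first` now stops at the first match instead of returning a flattened list, and the new `search`/`filter` helpers take a predicate over each element (`filter` is what `find_all` uses internally). A self-contained illustrative sketch (markup is made up):

    from scrapling.parser import Adaptor

    page = Adaptor(
        text="<ul><li class='item'>Price: 10</li><li class='item'>Price: 25</li><li>N/A</li></ul>",
        auto_match=False, debug=False,
    )
    items = page.css('li')

    first_match = items.re_first(r'\d+', default=None)            # first match across the whole list, or the default
    no_price = items.search(lambda el: el.text.strip() == 'N/A')  # first element satisfying the predicate, or None
    priced = items.filter(lambda el: 'Price' in el.text)          # new Adaptors holding only the matching elements (or [])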