scrapling 0.2.92-py3-none-any.whl → 0.2.93-py3-none-any.whl

scrapling/parser.py CHANGED
@@ -1,6 +1,7 @@
  import inspect
  import os
  import re
+ import typing
  from difflib import SequenceMatcher
  from urllib.parse import urljoin
 
@@ -145,47 +146,46 @@ class Adaptor(SelectorsGeneration):
          # Faster than checking `element.is_attribute or element.is_text or element.is_tail`
          return issubclass(type(element), etree._ElementUnicodeResult)
 
-     def __get_correct_result(
-             self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]
-     ) -> Union[TextHandler, html.HtmlElement, 'Adaptor', str]:
-         """Used internally in all functions to convert results to type (Adaptor|Adaptors) when possible"""
-         if self._is_text_node(element):
+     @staticmethod
+     def __content_convertor(element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> TextHandler:
+         """Used internally to convert a single element's text content to TextHandler directly without checks
+ 
+         This single line has been isolated like this so when it's used with map we get that slight performance boost vs list comprehension
+         """
+         return TextHandler(str(element))
+ 
+     def __element_convertor(self, element: html.HtmlElement) -> 'Adaptor':
+         """Used internally to convert a single HtmlElement to Adaptor directly without checks"""
+         return Adaptor(
+             root=element,
+             text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
+             url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
+             keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
+             huge_tree=self.__huge_tree_enabled,
+             **self.__response_data
+         )
+ 
+     def __handle_element(self, element: Union[html.HtmlElement, etree._ElementUnicodeResult]) -> Union[TextHandler, 'Adaptor', None]:
+         """Used internally in all functions to convert a single element to type (Adaptor|TextHandler) when possible"""
+         if element is None:
+             return None
+         elif self._is_text_node(element):
              # etree._ElementUnicodeResult basically inherit from `str` so it's fine
-             return TextHandler(str(element))
+             return self.__content_convertor(element)
          else:
-             if issubclass(type(element), html.HtmlMixin):
- 
-                 return Adaptor(
-                     root=element,
-                     text='', body=b'',  # Since root argument is provided, both `text` and `body` will be ignored so this is just a filler
-                     url=self.url, encoding=self.encoding, auto_match=self.__auto_match_enabled,
-                     keep_comments=self.__keep_comments, keep_cdata=self.__keep_cdata,
-                     huge_tree=self.__huge_tree_enabled,
-                     **self.__response_data
-                 )
-             return element
+             return self.__element_convertor(element)
 
-     def __convert_results(
-             self, result: Union[List[html.HtmlElement], html.HtmlElement]
-     ) -> Union['Adaptors[Adaptor]', 'Adaptor', List, None]:
-         """Used internally in all functions to convert results to type (Adaptor|Adaptors) in bulk when possible"""
-         if result is None:
-             return None
-         elif result == []:  # Lxml will give a warning if I used something like `not result`
-             return []
- 
-         if isinstance(result, Adaptors):
-             return result
- 
-         if type(result) is list:
-             results = [self.__get_correct_result(n) for n in result]
-             if all(isinstance(res, self.__class__) for res in results):
-                 return Adaptors(results)
-             elif all(isinstance(res, TextHandler) for res in results):
-                 return TextHandlers(results)
-             return results
+     def __handle_elements(self, result: List[Union[html.HtmlElement, etree._ElementUnicodeResult]]) -> Union['Adaptors', 'TextHandlers', List]:
+         """Used internally in all functions to convert results to type (Adaptors|TextHandlers) in bulk when possible"""
+         if not len(result):  # Lxml will give a warning if I used something like `not result`
+             return Adaptors([])
 
-         return self.__get_correct_result(result)
+         # From within the code, this method will always get a list of the same type
+         # so we will continue without checks for slight performance boost
+         if self._is_text_node(result[0]):
+             return TextHandlers(list(map(self.__content_convertor, result)))
+ 
+         return Adaptors(list(map(self.__element_convertor, result)))
 
      def __getstate__(self) -> Any:
          # lxml don't like it :)
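The `__content_convertor` docstring above claims a slight speed edge for `map` over a list comprehension when the callable is applied unchanged to every item, which is what `__handle_elements` relies on. A rough, illustrative micro-benchmark (not from the scrapling codebase) to check that claim on your own machine:

    # Compare map() against a list comprehension for a plain per-item call.
    import timeit

    items = list(range(10_000))

    map_time = timeit.timeit(lambda: list(map(str, items)), number=1_000)
    comp_time = timeit.timeit(lambda: [str(x) for x in items], number=1_000)

    print(f"map: {map_time:.3f}s  list comprehension: {comp_time:.3f}s")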
@@ -223,29 +223,16 @@ class Adaptor(SelectorsGeneration):
          :return: A TextHandler
          """
          _all_strings = []
- 
-         def _traverse(node: html.HtmlElement) -> None:
-             """Traverse element children and get text content of each
- 
-             :param node: Current node in the tree structure
-             :return:
-             """
+         for node in self._root.xpath('.//*'):
              if node.tag not in ignore_tags:
                  text = node.text
                  if text and type(text) is str:
-                     if valid_values:
-                         if text.strip():
-                             _all_strings.append(text if not strip else text.strip())
+                     if valid_values and text.strip():
+                         _all_strings.append(text if not strip else text.strip())
                      else:
                          _all_strings.append(text if not strip else text.strip())
 
-             for branch in node.iterchildren():
-                 _traverse(branch)
- 
-         # We will start using Lxml directly for the speed boost
-         _traverse(self._root)
- 
-         return TextHandler(separator.join([s for s in _all_strings]))
+         return TextHandler(separator.join(_all_strings))
 
      def urljoin(self, relative_url: str) -> str:
          """Join this Adaptor's url with a relative url to form an absolute full URL."""
@@ -259,18 +246,18 @@ class Adaptor(SelectorsGeneration):
          return self.__attributes
 
      @property
-     def html_content(self) -> str:
+     def html_content(self) -> TextHandler:
          """Return the inner html code of the element"""
-         return etree.tostring(self._root, encoding='unicode', method='html', with_tail=False)
+         return TextHandler(etree.tostring(self._root, encoding='unicode', method='html', with_tail=False))
 
      @property
-     def body(self) -> str:
+     def body(self) -> TextHandler:
          """Return raw HTML code of the element/page without any processing when possible or return `Adaptor.html_content`"""
-         return self.__raw_body or self.html_content
+         return TextHandler(self.__raw_body) or self.html_content
 
-     def prettify(self) -> str:
+     def prettify(self) -> TextHandler:
          """Return a prettified version of the element's inner html-code"""
-         return etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False)
+         return TextHandler(etree.tostring(self._root, encoding='unicode', pretty_print=True, method='html', with_tail=False))
 
      def has_class(self, class_name: str) -> bool:
          """Check if element has a specific class
@@ -282,26 +269,32 @@ class Adaptor(SelectorsGeneration):
      @property
      def parent(self) -> Union['Adaptor', None]:
          """Return the direct parent of the element or ``None`` otherwise"""
-         return self.__convert_results(self._root.getparent())
+         return self.__handle_element(self._root.getparent())
 
      @property
-     def children(self) -> Union['Adaptors[Adaptor]', List]:
+     def below_elements(self) -> 'Adaptors[Adaptor]':
+         """Return all elements under the current element in the DOM tree"""
+         below = self._root.xpath('.//*')
+         return self.__handle_elements(below)
+ 
+     @property
+     def children(self) -> 'Adaptors[Adaptor]':
          """Return the children elements of the current element or empty list otherwise"""
-         return self.__convert_results(list(
-             child for child in self._root.iterchildren() if type(child) not in html_forbidden
-         ))
+         return Adaptors([
+             self.__element_convertor(child) for child in self._root.iterchildren() if type(child) not in html_forbidden
+         ])
 
      @property
-     def siblings(self) -> Union['Adaptors[Adaptor]', List]:
+     def siblings(self) -> 'Adaptors[Adaptor]':
          """Return other children of the current element's parent or empty list otherwise"""
          if self.parent:
              return Adaptors([child for child in self.parent.children if child._root != self._root])
-         return []
+         return Adaptors([])
 
      def iterancestors(self) -> Generator['Adaptor', None, None]:
          """Return a generator that loops over all ancestors of the element, starting with element's parent."""
          for ancestor in self._root.iterancestors():
-             yield self.__convert_results(ancestor)
+             yield self.__element_convertor(ancestor)
 
      def find_ancestor(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptor', None]:
          """Loop over all ancestors of the element till one match the passed function
@@ -328,7 +321,7 @@ class Adaptor(SelectorsGeneration):
              # Ignore html comments and unwanted types
              next_element = next_element.getnext()
 
-         return self.__convert_results(next_element)
+         return self.__handle_element(next_element)
 
      @property
      def previous(self) -> Union['Adaptor', None]:
@@ -339,7 +332,7 @@ class Adaptor(SelectorsGeneration):
              # Ignore html comments and unwanted types
              prev_element = prev_element.getprevious()
 
-         return self.__convert_results(prev_element)
+         return self.__handle_element(prev_element)
 
      # For easy copy-paste from Scrapy/parsel code when needed :)
      def get(self, default=None):
@@ -392,34 +385,26 @@ class Adaptor(SelectorsGeneration):
          if issubclass(type(element), html.HtmlElement):
              element = _StorageTools.element_to_dict(element)
 
-         # TODO: Optimize the traverse logic a bit, maybe later
-         def _traverse(node: html.HtmlElement, ele: Dict) -> None:
-             """Get the matching score of the given element against the node then traverse the children
- 
-             :param node: Current node in the tree structure
-             :param ele: The element we are searching for as dictionary
-             :return:
-             """
+         for node in self._root.xpath('.//*'):
+             # Collect all elements in the page then for each element get the matching score of it against the node.
              # Hence: the code doesn't stop even if the score was 100%
              # because there might be another element(s) left in page with the same score
-             score = self.__calculate_similarity_score(ele, node)
+             score = self.__calculate_similarity_score(element, node)
              score_table.setdefault(score, []).append(node)
-             for branch in node.iterchildren():
-                 _traverse(branch, ele)
- 
-         # This will block until we traverse all children/branches
-         _traverse(self._root, element)
 
          if score_table:
              highest_probability = max(score_table.keys())
              if score_table[highest_probability] and highest_probability >= percentage:
-                 log.debug(f'Highest probability was {highest_probability}%')
-                 log.debug('Top 5 best matching elements are: ')
-                 for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
-                     log.debug(f'{percent} -> {self.__convert_results(score_table[percent])}')
+                 if log.getEffectiveLevel() < 20:
+                     # No need to execute this part if logging level is not debugging
+                     log.debug(f'Highest probability was {highest_probability}%')
+                     log.debug('Top 5 best matching elements are: ')
+                     for percent in tuple(sorted(score_table.keys(), reverse=True))[:5]:
+                         log.debug(f'{percent} -> {self.__handle_elements(score_table[percent])}')
+ 
                  if not adaptor_type:
                      return score_table[highest_probability]
-                 return self.__convert_results(score_table[highest_probability])
+                 return self.__handle_elements(score_table[highest_probability])
          return []
 
      def css_first(self, selector: str, identifier: str = '',
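The new `log.getEffectiveLevel() < 20` guard in `relocate` relies on the standard numeric logging levels, so the debug-formatting loop only runs when the logger is at DEBUG (10) or lower:

    import logging

    assert logging.DEBUG == 10 and logging.INFO == 20

    log = logging.getLogger("scrapling")
    log.setLevel(logging.INFO)
    print(log.getEffectiveLevel() < 20)   # False -> the debug block is skipped
    log.setLevel(logging.DEBUG)
    print(log.getEffectiveLevel() < 20)   # True  -> the debug block runs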
@@ -439,8 +424,6 @@ class Adaptor(SelectorsGeneration):
          :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
           Be aware that the percentage calculation depends solely on the page structure so don't play with this
           number unless you must know what you are doing!
- 
-         :return: List as :class:`Adaptors`
          """
          for element in self.css(selector, identifier, auto_match, auto_save, percentage):
              return element
@@ -465,8 +448,6 @@ class Adaptor(SelectorsGeneration):
          :param percentage: The minimum percentage to accept while auto-matching and not going lower than that.
           Be aware that the percentage calculation depends solely on the page structure so don't play with this
           number unless you must know what you are doing!
- 
-         :return: List as :class:`Adaptors`
          """
          for element in self.xpath(selector, identifier, auto_match, auto_save, percentage, **kwargs):
              return element
@@ -493,7 +474,7 @@ class Adaptor(SelectorsGeneration):
          :return: List as :class:`Adaptors`
          """
          try:
-             if not self.__auto_match_enabled:
+             if not self.__auto_match_enabled or ',' not in selector:
                  # No need to split selectors in this case, let's save some CPU cycles :)
                  xpath_selector = HTMLTranslator().css_to_xpath(selector)
                  return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
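With the added `',' not in selector` check, any comma-free CSS selector takes the fast path and is translated to XPath in one call to cssselect's `HTMLTranslator` (the class already used on the context line above). For reference, the translation looks roughly like this:

    from cssselect import HTMLTranslator

    print(HTMLTranslator().css_to_xpath('a.external'))
    # e.g. descendant-or-self::a[@class and contains(concat(' ', normalize-space(@class), ' '), ' external ')]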
@@ -507,11 +488,8 @@ class Adaptor(SelectorsGeneration):
                      results += self.xpath(
                          xpath_selector, identifier or single_selector.canonical(), auto_match, auto_save, percentage
                      )
-             else:
-                 xpath_selector = HTMLTranslator().css_to_xpath(selector)
-                 return self.xpath(xpath_selector, identifier or selector, auto_match, auto_save, percentage)
 
-             return self.__convert_results(results)
+             return results
          except (SelectorError, SelectorSyntaxError,):
              raise SelectorSyntaxError(f"Invalid CSS selector: {selector}")
 
@@ -538,37 +516,37 @@ class Adaptor(SelectorsGeneration):
          :return: List as :class:`Adaptors`
          """
          try:
-             selected_elements = self._root.xpath(selector, **kwargs)
+             elements = self._root.xpath(selector, **kwargs)
 
-             if selected_elements:
-                 if not self.__auto_match_enabled and auto_save:
-                     log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
- 
-                 elif self.__auto_match_enabled and auto_save:
-                     self.save(selected_elements[0], identifier or selector)
+             if elements:
+                 if auto_save:
+                     if not self.__auto_match_enabled:
+                         log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+                     else:
+                         self.save(elements[0], identifier or selector)
 
-                 return self.__convert_results(selected_elements)
-             else:
-                 if self.__auto_match_enabled and auto_match:
+                 return self.__handle_elements(elements)
+             elif self.__auto_match_enabled:
+                 if auto_match:
                      element_data = self.retrieve(identifier or selector)
                      if element_data:
-                         relocated = self.relocate(element_data, percentage)
-                         if relocated is not None and auto_save:
-                             self.save(relocated[0], identifier or selector)
- 
-                         return self.__convert_results(relocated)
-                     else:
-                         return self.__convert_results(selected_elements)
+                         elements = self.relocate(element_data, percentage)
+                         if elements is not None and auto_save:
+                             self.save(elements[0], identifier or selector)
 
-                 elif not self.__auto_match_enabled and auto_match:
+                 return self.__handle_elements(elements)
+             else:
+                 if auto_match:
                      log.warning("Argument `auto_match` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
+                 elif auto_save:
+                     log.warning("Argument `auto_save` will be ignored because `auto_match` wasn't enabled on initialization. Check docs for more info.")
 
-             return self.__convert_results(selected_elements)
+             return self.__handle_elements(elements)
 
          except (SelectorError, SelectorSyntaxError, etree.XPathError, etree.XPathEvalError):
              raise SelectorSyntaxError(f"Invalid XPath selector: {selector}")
 
-     def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptors[Adaptor]', List]:
+     def find_all(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> 'Adaptors':
          """Find elements by filters of your creations for ease..
 
          :param args: Tag name(s), an iterable of tag names, regex patterns, function, or a dictionary of elements' attributes. Leave empty for selecting all.
@@ -588,15 +566,7 @@ class Adaptor(SelectorsGeneration):
 
          attributes = dict()
          tags, patterns = set(), set()
-         results, functions, selectors = [], [], []
- 
-         def _search_tree(element: Adaptor, filter_function: Callable) -> None:
-             """Collect element if it fulfills passed function otherwise, traverse the children tree and iterate"""
-             if filter_function(element):
-                 results.append(element)
- 
-             for branch in element.children:
-                 _search_tree(branch, filter_function)
+         results, functions, selectors = Adaptors([]), [], []
 
          # Brace yourself for a wonderful journey!
          for arg in args:
@@ -608,12 +578,12 @@ class Adaptor(SelectorsGeneration):
                      raise TypeError('Nested Iterables are not accepted, only iterables of tag names are accepted')
                  tags.update(set(arg))
 
-             elif type(arg) is dict:
+             elif isinstance(arg, dict):
                  if not all([(type(k) is str and type(v) is str) for k, v in arg.items()]):
                      raise TypeError('Nested dictionaries are not accepted, only string keys and string values are accepted')
                  attributes.update(arg)
 
-             elif type(arg) is re.Pattern:
+             elif isinstance(arg, re.Pattern):
                  patterns.add(arg)
 
              elif callable(arg):
@@ -634,14 +604,14 @@ class Adaptor(SelectorsGeneration):
                  attributes[attribute_name] = value
 
          # It's easier and faster to build a selector than traversing the tree
-         tags = tags or ['']
+         tags = tags or ['*']
          for tag in tags:
              selector = tag
              for key, value in attributes.items():
                  value = value.replace('"', r'\"')  # Escape double quotes in user input
                  # Not escaping anything with the key so the user can pass patterns like {'href*': '/p/'} or get errors :)
                  selector += '[{}="{}"]'.format(key, value)
-             if selector:
+             if selector != '*':
                  selectors.append(selector)
 
          if selectors:
@@ -655,14 +625,15 @@ class Adaptor(SelectorsGeneration):
                  for function in functions:
                      results = results.filter(function)
          else:
+             results = results or self.below_elements
              for pattern in patterns:
-                 results.extend(self.find_by_regex(pattern, first_match=False))
+                 results = results.filter(lambda e: e.text.re(pattern, check_match=True))
 
-             for result in (results or [self]):
-                 for function in functions:
-                     _search_tree(result, function)
+             # Collect element if it fulfills passed function otherwise
+             for function in functions:
+                 results = results.filter(function)
 
-         return self.__convert_results(results)
+         return results
 
      def find(self, *args: Union[str, Iterable[str], Pattern, Callable, Dict[str, str]], **kwargs: str) -> Union['Adaptor', None]:
          """Find elements by filters of your creations for ease then return the first result. Otherwise return `None`.
@@ -792,7 +763,7 @@ class Adaptor(SelectorsGeneration):
          return self.get_all_text(strip=True).json()
 
      def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-            clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
+            clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers:
          """Apply the given regex to the current text and return a list of strings with the matches.
 
          :param regex: Can be either a compiled regular expression or a string.
@@ -803,7 +774,7 @@ class Adaptor(SelectorsGeneration):
          return self.text.re(regex, replace_entities, clean_match, case_sensitive)
 
      def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                  clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
+                  clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
          """Apply the given regex to text and return the first match if found, otherwise return the default value.
 
          :param regex: Can be either a compiled regular expression or a string.
@@ -894,12 +865,12 @@ class Adaptor(SelectorsGeneration):
              if potential_match != root and are_alike(root, target_attrs, potential_match):
                  similar_elements.append(potential_match)
 
-         return self.__convert_results(similar_elements)
+         return self.__handle_elements(similar_elements)
 
      def find_by_text(
              self, text: str, first_match: bool = True, partial: bool = False,
              case_sensitive: bool = False, clean_match: bool = True
-     ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
+     ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
          """Find elements that its text content fully/partially matches input.
          :param text: Text query to match
          :param first_match: Return first element that matches conditions, enabled by default
@@ -908,74 +879,60 @@ class Adaptor(SelectorsGeneration):
          :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
          """
 
-         results = []
+         results = Adaptors([])
          if not case_sensitive:
              text = text.lower()
 
-         def _traverse(node: Adaptor) -> None:
+         # This selector gets all elements with text content
+         for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
              """Check if element matches given text otherwise, traverse the children tree and iterate"""
              node_text = node.text
-             # if there's already no text in this node, dodge it to save CPU cycles and time
-             if node_text:
-                 if clean_match:
-                     node_text = node_text.clean()
- 
-                 if not case_sensitive:
-                     node_text = node_text.lower()
- 
-                 if partial:
-                     if text in node_text:
-                         results.append(node)
-                 elif text == node_text:
-                     results.append(node)
+             if clean_match:
+                 node_text = node_text.clean()
 
-             if results and first_match:
-                 # we got an element so we should stop
-                 return
+             if not case_sensitive:
+                 node_text = node_text.lower()
 
-             for branch in node.children:
-                 _traverse(branch)
+             if partial:
+                 if text in node_text:
+                     results.append(node)
+             elif text == node_text:
+                 results.append(node)
 
-         # This will block until we traverse all children/branches
-         _traverse(self)
+             if first_match and results:
+                 # we got an element so we should stop
+                 break
 
          if first_match:
              if results:
                  return results[0]
-         return self.__convert_results(results)
+         return results
 
      def find_by_regex(
              self, query: Union[str, Pattern[str]], first_match: bool = True, case_sensitive: bool = False, clean_match: bool = True
-     ) -> Union['Adaptors[Adaptor]', 'Adaptor', List]:
+     ) -> Union['Adaptors[Adaptor]', 'Adaptor']:
          """Find elements that its text content matches the input regex pattern.
          :param query: Regex query/pattern to match
          :param first_match: Return first element that matches conditions, enabled by default
          :param case_sensitive: if enabled, letters case will be taken into consideration in the regex
          :param clean_match: if enabled, this will ignore all whitespaces and consecutive spaces while matching
          """
-         results = []
+         results = Adaptors([])
 
-         def _traverse(node: Adaptor) -> None:
+         # This selector gets all elements with text content
+         for node in self.__handle_elements(self._root.xpath('.//*[normalize-space(text())]')):
              """Check if element matches given regex otherwise, traverse the children tree and iterate"""
              node_text = node.text
-             # if there's already no text in this node, dodge it to save CPU cycles and time
-             if node_text:
-                 if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
-                     results.append(node)
+             if node_text.re(query, check_match=True, clean_match=clean_match, case_sensitive=case_sensitive):
+                 results.append(node)
 
-             if results and first_match:
+             if first_match and results:
                  # we got an element so we should stop
-                 return
- 
-             for branch in node.children:
-                 _traverse(branch)
- 
-         # This will block until we traverse all children/branches
-         _traverse(self)
+                 break
 
          if results and first_match:
              return results[0]
-         return self.__convert_results(results)
+         return results
 
 
  class Adaptors(List[Adaptor]):
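The `.//*[normalize-space(text())]` pre-filter now used by `find_by_text` and `find_by_regex` keeps only elements whose own text node is non-empty after whitespace normalization, so the Python loop never visits text-less nodes. A minimal lxml illustration (hypothetical markup):

    from lxml import html

    root = html.fromstring("<div>  <p>Hi</p><p> </p><span><i>x</i></span></div>")
    print([e.tag for e in root.xpath('.//*[normalize-space(text())]')])   # ['p', 'i']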
@@ -984,7 +941,15 @@ class Adaptors(List[Adaptor]):
      """
      __slots__ = ()
 
-     def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors[Adaptor]"]:
+     @typing.overload
+     def __getitem__(self, pos: SupportsIndex) -> Adaptor:
+         pass
+ 
+     @typing.overload
+     def __getitem__(self, pos: slice) -> "Adaptors":
+         pass
+ 
+     def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[Adaptor, "Adaptors"]:
          lst = super().__getitem__(pos)
          if isinstance(pos, slice):
              return self.__class__(lst)
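The `typing.overload` pair above tells type checkers that integer indexing yields a single `Adaptor` while slicing yields an `Adaptors`; runtime behaviour is unchanged. The same pattern on a plain generic list subclass, for reference:

    from typing import List, SupportsIndex, Union, overload

    class StrList(List[str]):
        @overload
        def __getitem__(self, pos: SupportsIndex) -> str: ...
        @overload
        def __getitem__(self, pos: slice) -> "StrList": ...

        def __getitem__(self, pos: Union[SupportsIndex, slice]) -> Union[str, "StrList"]:
            result = super().__getitem__(pos)
            return StrList(result) if isinstance(pos, slice) else result

    items = StrList(["a", "b", "c"])
    first: str = items[0]      # a type checker sees str here
    rest: StrList = items[1:]  # ...and StrList here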
@@ -993,7 +958,7 @@ class Adaptors(List[Adaptor]):
 
      def xpath(
              self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0, **kwargs: Any
-     ) -> Union["Adaptors[Adaptor]", List]:
+     ) -> "Adaptors[Adaptor]":
          """
          Call the ``.xpath()`` method for each element in this list and return
          their results as another :class:`Adaptors`.
@@ -1019,7 +984,7 @@ class Adaptors(List[Adaptor]):
          ]
          return self.__class__(flatten(results))
 
-     def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> Union["Adaptors[Adaptor]", List]:
+     def css(self, selector: str, identifier: str = '', auto_save: bool = False, percentage: int = 0) -> "Adaptors[Adaptor]":
          """
          Call the ``.css()`` method for each element in this list and return
          their results flattened as another :class:`Adaptors`.
@@ -1044,7 +1009,7 @@ class Adaptors(List[Adaptor]):
          return self.__class__(flatten(results))
 
      def re(self, regex: Union[str, Pattern[str]], replace_entities: bool = True,
-            clean_match: bool = False, case_sensitive: bool = False) -> 'List[str]':
+            clean_match: bool = False, case_sensitive: bool = False) -> TextHandlers[TextHandler]:
          """Call the ``.re()`` method for each element in this list and return
          their results flattened as List of TextHandler.
 
@@ -1056,10 +1021,10 @@ class Adaptors(List[Adaptor]):
          results = [
              n.text.re(regex, replace_entities, clean_match, case_sensitive) for n in self
          ]
-         return flatten(results)
+         return TextHandlers(flatten(results))
 
      def re_first(self, regex: Union[str, Pattern[str]], default=None, replace_entities: bool = True,
-                  clean_match: bool = False, case_sensitive: bool = False) -> Union[str, None]:
+                  clean_match: bool = False, case_sensitive: bool = False) -> TextHandler:
          """Call the ``.re_first()`` method for each element in this list and return
          the first result or the default value otherwise.
 
@@ -1084,15 +1049,14 @@ class Adaptors(List[Adaptor]):
                  return element
          return None
 
-     def filter(self, func: Callable[['Adaptor'], bool]) -> Union['Adaptors', List]:
+     def filter(self, func: Callable[['Adaptor'], bool]) -> 'Adaptors[Adaptor]':
          """Filter current elements based on the passed function
          :param func: A function that takes each element as an argument and returns True/False
          :return: The new `Adaptors` object or empty list otherwise.
          """
-         results = [
+         return self.__class__([
              element for element in self if func(element)
-         ]
-         return self.__class__(results) if results else results
+         ])
 
      # For easy copy-paste from Scrapy/parsel code when needed :)
      def get(self, default=None):