webscout-6.4-py3-none-any.whl → webscout-6.6-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release of webscout has been flagged as potentially problematic.

Files changed (116)
  1. webscout/AIutel.py +7 -54
  2. webscout/DWEBS.py +48 -26
  3. webscout/{YTdownloader.py → Extra/YTToolkit/YTdownloader.py} +990 -1103
  4. webscout/Extra/YTToolkit/__init__.py +3 -0
  5. webscout/{transcriber.py → Extra/YTToolkit/transcriber.py} +1 -1
  6. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  7. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  8. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  9. webscout/Extra/YTToolkit/ytapi/extras.py +45 -0
  10. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  11. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  12. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  13. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  14. webscout/Extra/YTToolkit/ytapi/query.py +37 -0
  15. webscout/Extra/YTToolkit/ytapi/stream.py +60 -0
  16. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  17. webscout/Extra/YTToolkit/ytapi/video.py +102 -0
  18. webscout/Extra/__init__.py +2 -1
  19. webscout/Extra/autocoder/autocoder_utiles.py +119 -101
  20. webscout/Extra/autocoder/rawdog.py +679 -680
  21. webscout/Extra/gguf.py +441 -441
  22. webscout/Extra/markdownlite/__init__.py +862 -0
  23. webscout/Extra/weather_ascii.py +2 -2
  24. webscout/Provider/AISEARCH/__init__.py +2 -0
  25. webscout/Provider/AISEARCH/ooai.py +155 -0
  26. webscout/Provider/Amigo.py +70 -85
  27. webscout/Provider/{prefind.py → Jadve.py} +72 -70
  28. webscout/Provider/Netwrck.py +235 -0
  29. webscout/Provider/Openai.py +4 -3
  30. webscout/Provider/PI.py +292 -221
  31. webscout/Provider/PizzaGPT.py +3 -3
  32. webscout/Provider/Reka.py +0 -1
  33. webscout/Provider/TTS/__init__.py +5 -1
  34. webscout/Provider/TTS/deepgram.py +183 -0
  35. webscout/Provider/TTS/elevenlabs.py +137 -0
  36. webscout/Provider/TTS/gesserit.py +151 -0
  37. webscout/Provider/TTS/murfai.py +139 -0
  38. webscout/Provider/TTS/parler.py +134 -107
  39. webscout/Provider/TTS/streamElements.py +360 -275
  40. webscout/Provider/TTS/utils.py +280 -0
  41. webscout/Provider/TTS/voicepod.py +116 -116
  42. webscout/Provider/TeachAnything.py +15 -2
  43. webscout/Provider/Youchat.py +42 -8
  44. webscout/Provider/__init__.py +8 -21
  45. webscout/Provider/meta.py +794 -779
  46. webscout/Provider/multichat.py +230 -0
  47. webscout/Provider/promptrefine.py +2 -2
  48. webscout/Provider/talkai.py +10 -13
  49. webscout/Provider/turboseek.py +5 -4
  50. webscout/Provider/tutorai.py +8 -112
  51. webscout/Provider/typegpt.py +5 -7
  52. webscout/Provider/x0gpt.py +81 -9
  53. webscout/Provider/yep.py +123 -361
  54. webscout/__init__.py +33 -28
  55. webscout/conversation.py +24 -9
  56. webscout/exceptions.py +188 -20
  57. webscout/litprinter/__init__.py +719 -831
  58. webscout/litprinter/colors.py +54 -0
  59. webscout/optimizers.py +420 -270
  60. webscout/prompt_manager.py +279 -279
  61. webscout/scout/__init__.py +8 -0
  62. webscout/scout/core/__init__.py +7 -0
  63. webscout/scout/core/crawler.py +140 -0
  64. webscout/scout/core/scout.py +571 -0
  65. webscout/scout/core/search_result.py +96 -0
  66. webscout/scout/core/text_analyzer.py +63 -0
  67. webscout/scout/core/text_utils.py +277 -0
  68. webscout/scout/core/web_analyzer.py +52 -0
  69. webscout/scout/core.py +884 -0
  70. webscout/scout/element.py +460 -0
  71. webscout/scout/parsers/__init__.py +69 -0
  72. webscout/scout/parsers/html5lib_parser.py +172 -0
  73. webscout/scout/parsers/html_parser.py +236 -0
  74. webscout/scout/parsers/lxml_parser.py +178 -0
  75. webscout/scout/utils.py +38 -0
  76. webscout/update_checker.py +184 -125
  77. webscout/version.py +1 -1
  78. webscout/zeroart/__init__.py +55 -0
  79. webscout/zeroart/base.py +60 -0
  80. webscout/zeroart/effects.py +99 -0
  81. webscout/zeroart/fonts.py +816 -0
  82. webscout/zerodir/__init__.py +225 -0
  83. {webscout-6.4.dist-info → webscout-6.6.dist-info}/METADATA +18 -231
  84. webscout-6.6.dist-info/RECORD +197 -0
  85. webscout-6.6.dist-info/top_level.txt +2 -0
  86. webstoken/__init__.py +30 -0
  87. webstoken/classifier.py +189 -0
  88. webstoken/keywords.py +216 -0
  89. webstoken/language.py +128 -0
  90. webstoken/ner.py +164 -0
  91. webstoken/normalizer.py +35 -0
  92. webstoken/processor.py +77 -0
  93. webstoken/sentiment.py +206 -0
  94. webstoken/stemmer.py +73 -0
  95. webstoken/t.py +75 -0
  96. webstoken/tagger.py +60 -0
  97. webstoken/tokenizer.py +158 -0
  98. webscout/Agents/Onlinesearcher.py +0 -182
  99. webscout/Agents/__init__.py +0 -2
  100. webscout/Agents/functioncall.py +0 -248
  101. webscout/Bing_search.py +0 -251
  102. webscout/Provider/Perplexity.py +0 -599
  103. webscout/Provider/RoboCoders.py +0 -206
  104. webscout/Provider/genspark.py +0 -225
  105. webscout/Provider/perplexitylabs.py +0 -265
  106. webscout/Provider/twitterclone.py +0 -251
  107. webscout/Provider/upstage.py +0 -230
  108. webscout/gpt4free.py +0 -666
  109. webscout/requestsHTMLfix.py +0 -775
  110. webscout/webai.py +0 -2590
  111. webscout-6.4.dist-info/RECORD +0 -154
  112. webscout-6.4.dist-info/top_level.txt +0 -1
  113. /webscout/Provider/{felo_search.py → AISEARCH/felo_search.py} +0 -0
  114. {webscout-6.4.dist-info → webscout-6.6.dist-info}/LICENSE.md +0 -0
  115. {webscout-6.4.dist-info → webscout-6.6.dist-info}/WHEEL +0 -0
  116. {webscout-6.4.dist-info → webscout-6.6.dist-info}/entry_points.txt +0 -0
webscout/scout/core/crawler.py (new file)
@@ -0,0 +1,140 @@
+"""
+Scout Crawler Module
+"""
+
+import concurrent.futures
+import urllib.parse
+from typing import Union, List, Dict
+import requests
+
+from .scout import Scout
+
+class ScoutCrawler:
+    """
+    Advanced web crawling utility for Scout library.
+    """
+    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
+        """
+        Initialize the web crawler.
+
+        Args:
+            base_url (str): Starting URL to crawl
+            max_pages (int, optional): Maximum number of pages to crawl
+            tags_to_remove (List[str], optional): List of tags to remove
+        """
+        self.base_url = base_url
+        self.max_pages = max_pages
+        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else ["script", "style", "header", "footer", "nav", "aside", "form", "button"]
+        self.visited_urls = set()
+        self.crawled_pages = []
+
+    def _is_valid_url(self, url: str) -> bool:
+        """
+        Check if a URL is valid and within the same domain.
+
+        Args:
+            url (str): URL to validate
+
+        Returns:
+            bool: Whether the URL is valid
+        """
+        try:
+            parsed_base = urllib.parse.urlparse(self.base_url)
+            parsed_url = urllib.parse.urlparse(url)
+
+            return (
+                parsed_url.scheme in ['http', 'https'] and
+                parsed_base.netloc == parsed_url.netloc and
+                len(self.visited_urls) < self.max_pages
+            )
+        except Exception:
+            return False
+
+    def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
+        """
+        Crawl a single page and extract information.
+
+        Args:
+            url (str): URL to crawl
+            depth (int, optional): Current crawl depth
+
+        Returns:
+            Dict[str, Union[str, List[str]]]: Crawled page information
+        """
+        if url in self.visited_urls:
+            return {}
+
+        try:
+            response = requests.get(url, timeout=10)
+            response.raise_for_status()
+
+            scout = Scout(response.content, features='lxml')
+
+            title_result = scout.find('title')
+            title = title_result[0].get_text() if title_result else ''
+
+            visible_text = scout._soup.get_text(strip=True)
+
+            for tag in scout._soup(self.tags_to_remove):
+                tag.extract()
+
+            page_info = {
+                'url': url,
+                'title': title,
+                'links': [
+                    urllib.parse.urljoin(url, link.get('href'))
+                    for link in scout.find_all('a', href=True)
+                    if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                ],
+                'text': visible_text,
+                'depth': depth
+            }
+
+            self.visited_urls.add(url)
+            self.crawled_pages.append(page_info)
+
+            return page_info
+        except Exception as e:
+            print(f"Error crawling {url}: {e}")
+            return {}
+
+    def crawl(self) -> List[Dict[str, Union[str, List[str]]]]:
+        """
+        Start web crawling from base URL.
+
+        Returns:
+            List[Dict[str, Union[str, List[str]]]]: List of crawled pages
+        """
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            futures = {executor.submit(self._crawl_page, self.base_url, 0)}
+
+            while futures:
+                done, futures = concurrent.futures.wait(
+                    futures, return_when=concurrent.futures.FIRST_COMPLETED
+                )
+
+                for future in done:
+                    page_info = future.result()
+
+                    if len(self.visited_urls) >= self.max_pages:
+                        break
+
+                    submitted_links = set()  # New set to track submitted links
+                    for link in page_info.get('links', []):
+                        if (
+                            len(self.visited_urls) < self.max_pages and
+                            link not in self.visited_urls
+                        ):
+                            if link not in submitted_links:  # Check against submitted links
+                                submitted_links.add(link)  # Add to submitted links
+                                futures.add(
+                                    executor.submit(
+                                        self._crawl_page,
+                                        link,
+                                        page_info.get('depth', 0) + 1
+                                    )
+                                )
+                    if len(self.visited_urls) >= self.max_pages:
+                        break
+
+        return self.crawled_pages
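
Based on the constructor and `crawl()` signatures in the new crawler module above, a minimal usage sketch might look like the following. The import path and the target URL are illustrative assumptions; the `webscout.scout` package in this release may also re-export `ScoutCrawler` under a shorter path.

```python
# Hypothetical usage sketch for the new ScoutCrawler; import path and URL are assumptions.
from webscout.scout.core.crawler import ScoutCrawler

# Crawl at most 10 same-domain pages starting from the base URL.
crawler = ScoutCrawler("https://example.com", max_pages=10)
pages = crawler.crawl()

for page in pages:
    # Each result dict carries 'url', 'title', 'links', 'text' and 'depth', as built in _crawl_page.
    print(page["depth"], page["url"], page["title"])
```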
webscout/scout/core/scout.py (new file)
@@ -0,0 +1,571 @@
+"""
+Scout Main Module - HTML Parsing and Traversal
+"""
+import sys
+import re
+import warnings
+import json
+import hashlib
+import unicodedata
+import os
+import urllib.parse
+from typing import Union, List, Dict, Optional, Callable, Any, Tuple
+
+from ..parsers import ParserRegistry
+from ..element import Tag, NavigableString
+from ..utils import decode_markup
+from .text_analyzer import ScoutTextAnalyzer
+from .web_analyzer import ScoutWebAnalyzer
+from .search_result import ScoutSearchResult
+from .text_utils import SentenceTokenizer
+
+
+class Scout:
+    """
+    Scout - Making web scraping a breeze! 🌊
+    A comprehensive HTML parsing and traversal library.
+    Enhanced with advanced features and intelligent parsing.
+    """
+
+    def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
+        """
+        Initialize Scout with HTML content.
+
+        Args:
+            markup (str): HTML content to parse
+            features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
+            from_encoding (str): Source encoding (if known)
+            **kwargs: Additional parsing options
+        """
+        # Intelligent markup handling
+        self.markup = self._preprocess_markup(markup, from_encoding)
+        self.features = features
+        self.from_encoding = from_encoding
+
+        # Get the right parser for the job
+        if features not in ParserRegistry.list_parsers():
+            raise ValueError(
+                f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
+            )
+
+        parser_class = ParserRegistry.get_parser(features)
+        self.parser = parser_class
+
+        # Parse that HTML! 🎯
+        self._soup = self.parser.parse(self.markup)
+
+        # BeautifulSoup-like attributes
+        self.name = self._soup.name if hasattr(self._soup, 'name') else None
+        self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
+
+        # Advanced parsing options
+        self._cache = {}
+
+        # Text and web analyzers
+        self.text_analyzer = ScoutTextAnalyzer()
+        self.web_analyzer = ScoutWebAnalyzer()
+
+    def normalize_text(self, text: str, form='NFKD') -> str:
+        """
+        Normalize text using Unicode normalization.
+
+        Args:
+            text (str): Input text
+            form (str, optional): Normalization form
+
+        Returns:
+            str: Normalized text
+        """
+        return unicodedata.normalize(form, text)
+
+    def url_parse(self, url: str) -> Dict[str, str]:
+        """
+        Parse and analyze a URL.
+
+        Args:
+            url (str): URL to parse
+
+        Returns:
+            Dict[str, str]: Parsed URL components
+        """
+        parsed = urllib.parse.urlparse(url)
+        return {
+            'scheme': parsed.scheme,
+            'netloc': parsed.netloc,
+            'path': parsed.path,
+            'params': parsed.params,
+            'query': parsed.query,
+            'fragment': parsed.fragment
+        }
+
+    def analyze_page_structure(self) -> Dict[str, Any]:
+        """
+        Analyze the structure of the parsed page.
+
+        Returns:
+            Dict[str, Any]: Page structure analysis
+        """
+        return self.web_analyzer.analyze_page_structure(self)
+
+    def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
+        """
+        Perform advanced text analysis.
+
+        Args:
+            text (str, optional): Text to analyze. If None, uses page text.
+
+        Returns:
+            Dict[str, Any]: Text analysis results
+        """
+        if text is None:
+            text = self.get_text()
+
+        return {
+            'word_count': self.text_analyzer.count_words(text),
+            'entities': self.text_analyzer.extract_entities(text),
+            'tokens': self.text_analyzer.tokenize(text)
+        }
+
+    def extract_semantic_info(self) -> Dict[str, Any]:
+        """
+        Extract semantic information from the document.
+
+        Returns:
+            Dict[str, Any]: Semantic information
+        """
+        semantic_info = {
+            'headings': {
+                'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
+                'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
+                'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
+            },
+            'lists': {
+                'ul': [ul.find_all('li') for ul in self.find_all('ul')],
+                'ol': [ol.find_all('li') for ol in self.find_all('ol')]
+            },
+            'tables': {
+                'count': len(self.find_all('table')),
+                'headers': [table.find_all('th') for table in self.find_all('table')]
+            }
+        }
+        return semantic_info
+
+    def cache(self, key: str, value: Any = None) -> Any:
+        """
+        Manage a cache for parsed content.
+
+        Args:
+            key (str): Cache key
+            value (Any, optional): Value to cache
+
+        Returns:
+            Any: Cached value or None
+        """
+        if value is not None:
+            self._cache[key] = value
+        return self._cache.get(key)
+
+    def hash_content(self, method='md5') -> str:
+        """
+        Generate a hash of the parsed content.
+
+        Args:
+            method (str, optional): Hashing method
+
+        Returns:
+            str: Content hash
+        """
+        hash_methods = {
+            'md5': hashlib.md5,
+            'sha1': hashlib.sha1,
+            'sha256': hashlib.sha256
+        }
+
+        if method not in hash_methods:
+            raise ValueError(f"Unsupported hash method: {method}")
+
+        hasher = hash_methods[method]()
+        hasher.update(str(self._soup).encode('utf-8'))
+        return hasher.hexdigest()
+
+    def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
+        """
+        Extract all links from the document.
+
+        Args:
+            base_url (str, optional): Base URL for resolving relative links
+
+        Returns:
+            List[Dict[str, str]]: List of link dictionaries
+        """
+        links = []
+        for link in self.find_all(['a', 'link']):
+            href = link.get('href')
+            if href:
+                # Resolve relative URLs if base_url is provided
+                if base_url and not href.startswith(('http://', 'https://', '//')):
+                    href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
+
+                links.append({
+                    'href': href,
+                    'text': link.get_text(strip=True),
+                    'rel': link.get('rel', [None])[0],
+                    'type': link.get('type')
+                })
+        return links
+
+    def extract_metadata(self) -> Dict[str, Any]:
+        """
+        Extract metadata from HTML document.
+
+        Returns:
+            Dict[str, Any]: Extracted metadata
+        """
+        metadata = {
+            'title': self.find('title').texts()[0] if self.find('title').texts() else None,
+            'description': self.find('meta', attrs={'name': 'description'}).attrs('content')[0] if self.find('meta', attrs={'name': 'description'}).attrs('content') else None,
+            'keywords': self.find('meta', attrs={'name': 'keywords'}).attrs('content')[0].split(',') if self.find('meta', attrs={'name': 'keywords'}).attrs('content') else [],
+            'og_metadata': {},
+            'twitter_metadata': {}
+        }
+
+        # Open Graph metadata
+        for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
+            key = meta.attrs('property')[0][3:]
+            metadata['og_metadata'][key] = meta.attrs('content')[0]
+
+        # Twitter Card metadata
+        for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
+            key = meta.attrs('name')[0][8:]
+            metadata['twitter_metadata'][key] = meta.attrs('content')[0]
+
+        return metadata
+
+    def to_json(self, indent=2) -> str:
+        """
+        Convert parsed content to JSON.
+
+        Args:
+            indent (int, optional): JSON indentation
+
+        Returns:
+            str: JSON representation of the document
+        """
+        def _tag_to_dict(tag):
+            if isinstance(tag, NavigableString):
+                return str(tag)
+
+            result = {
+                'name': tag.name,
+                'attrs': tag.attrs,
+                'text': tag.get_text(strip=True)
+            }
+
+            if tag.contents:
+                result['children'] = [_tag_to_dict(child) for child in tag.contents]
+
+            return result
+
+        return json.dumps(_tag_to_dict(self._soup), indent=indent)
+
+    def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
+        """
+        Find the first matching element.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+            recursive (bool, optional): Search recursively
+            text (str, optional): Text content to match
+
+        Returns:
+            ScoutSearchResult: First matching element
+        """
+        result = self._soup.find(name, attrs, recursive, text, **kwargs)
+        return ScoutSearchResult([result]) if result else ScoutSearchResult([])
+
+    def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
+        """
+        Find all matching elements.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+            recursive (bool, optional): Search recursively
+            text (str, optional): Text content to match
+            limit (int, optional): Maximum number of results
+
+        Returns:
+            ScoutSearchResult: List of matching elements
+        """
+        results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
+        return ScoutSearchResult(results)
+
+    def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
+        """
+        Find the first parent matching given criteria.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+
+        Returns:
+            Tag or None: First matching parent
+        """
+        current = self._soup.parent
+        while current:
+            if (name is None or current.name == name) and \
+               all(current.get(k) == v for k, v in attrs.items()):
+                return current
+            current = current.parent
+        return None
+
+    def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all parents matching given criteria.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+            limit (int, optional): Maximum number of results
+
+        Returns:
+            List[Tag]: List of matching parents
+        """
+        parents = []
+        current = self._soup.parent
+        while current and (limit is None or len(parents) < limit):
+            if (name is None or current.name == name) and \
+               all(current.get(k) == v for k, v in attrs.items()):
+                parents.append(current)
+            current = current.parent
+        return parents
+
+    def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
+        """
+        Find the next sibling matching given criteria.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+
+        Returns:
+            Tag or None: First matching next sibling
+        """
+        if not self._soup.parent:
+            return None
+
+        siblings = self._soup.parent.contents
+        try:
+            current_index = siblings.index(self._soup)
+            for sibling in siblings[current_index + 1:]:
+                if isinstance(sibling, Tag):
+                    if (name is None or sibling.name == name) and \
+                       all(sibling.get(k) == v for k, v in attrs.items()):
+                        return sibling
+        except ValueError:
+            pass
+        return None
+
+    def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all next siblings matching given criteria.
+
+        Args:
+            name (str, optional): Tag name to search for
+            attrs (dict, optional): Attributes to match
+            limit (int, optional): Maximum number of results
+
+        Returns:
+            List[Tag]: List of matching next siblings
+        """
+        if not self._soup.parent:
+            return []
+
+        siblings = []
+        siblings_list = self._soup.parent.contents
+        try:
+            current_index = siblings_list.index(self._soup)
+            for sibling in siblings_list[current_index + 1:]:
+                if isinstance(sibling, Tag):
+                    if (name is None or sibling.name == name) and \
+                       all(sibling.get(k) == v for k, v in attrs.items()):
+                        siblings.append(sibling)
+                        if limit and len(siblings) == limit:
+                            break
+        except ValueError:
+            pass
+        return siblings
+
+    def select(self, selector: str) -> List[Tag]:
+        """
+        Select elements using CSS selector.
+
+        Args:
+            selector (str): CSS selector string
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.select(selector)
+
+    def select_one(self, selector: str) -> Optional[Tag]:
+        """
+        Select the first element matching the CSS selector.
+
+        Args:
+            selector (str): CSS selector string
+
+        Returns:
+            Tag or None: First matching element
+        """
+        return self._soup.select_one(selector)
+
+    def get_text(self, separator=' ', strip=False, types=None) -> str:
+        """
+        Extract all text from the parsed document.
+
+        Args:
+            separator (str, optional): Text separator
+            strip (bool, optional): Strip whitespace
+            types (list, optional): Types of content to extract
+
+        Returns:
+            str: Extracted text
+        """
+        tokenizer = SentenceTokenizer()
+        text = self._soup.get_text(separator, strip, types)
+        sentences = tokenizer.tokenize(text)
+        return "\n\n".join(sentences)
+
+    def remove_tags(self, tags: List[str]) -> None:
+        """
+        Remove specified tags and their contents from the document.
+
+        Args:
+            tags (List[str]): List of tag names to remove
+        """
+        for tag_name in tags:
+            for tag in self._soup.find_all(tag_name):
+                tag.decompose()
+
+    def prettify(self, formatter='minimal') -> str:
+        """
+        Return a formatted, pretty-printed version of the HTML.
+
+        Args:
+            formatter (str, optional): Formatting style
+
+        Returns:
+            str: Prettified HTML
+        """
+        return self._soup.prettify(formatter)
+
+    def decompose(self, tag: Tag = None) -> None:
+        """
+        Remove a tag and its contents from the document.
+
+        Args:
+            tag (Tag, optional): Tag to remove. If None, removes the root tag.
+        """
+        if tag is None:
+            tag = self._soup
+        tag.decompose()
+
+    def extract(self, tag: Tag = None) -> Tag:
+        """
+        Remove a tag from the document and return it.
+
+        Args:
+            tag (Tag, optional): Tag to extract. If None, extracts the root tag.
+
+        Returns:
+            Tag: Extracted tag
+        """
+        if tag is None:
+            tag = self._soup
+        return tag.extract()
+
+    def clear(self, tag: Tag = None) -> None:
+        """
+        Remove a tag's contents while keeping the tag itself.
+
+        Args:
+            tag (Tag, optional): Tag to clear. If None, clears the root tag.
+        """
+        if tag is None:
+            tag = self._soup
+        tag.clear()
+
+    def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
+        """
+        Replace one tag with another.
+
+        Args:
+            old_tag (Tag): Tag to replace
+            new_tag (Tag): Replacement tag
+        """
+        old_tag.replace_with(new_tag)
+
+    def encode(self, encoding='utf-8') -> bytes:
+        """
+        Encode the document to a specific encoding.
+
+        Args:
+            encoding (str, optional): Encoding to use
+
+        Returns:
+            bytes: Encoded document
+        """
+        return str(self._soup).encode(encoding)
+
+    def decode(self, encoding='utf-8') -> str:
+        """
+        Decode the document from a specific encoding.
+
+        Args:
+            encoding (str, optional): Encoding to use
+
+        Returns:
+            str: Decoded document
+        """
+        return str(self._soup)
+
+    def __str__(self) -> str:
+        """
+        String representation of the parsed document.
+
+        Returns:
+            str: HTML content
+        """
+        return str(self._soup)
+
+    def __repr__(self) -> str:
+        """
+        Detailed representation of the Scout object.
+
+        Returns:
+            str: Scout object description
+        """
+        return f"Scout(features='{self.features}', content_length={len(self.markup)})"
+
+    def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
+        """
+        Preprocess markup before parsing.
+
+        Args:
+            markup (str): Input markup
+            encoding (str, optional): Encoding to use
+
+        Returns:
+            str: Preprocessed markup
+        """
+        # Decode markup
+        decoded_markup = decode_markup(markup, encoding)
+
+        # Basic HTML cleaning
+        # Remove comments, normalize whitespace, etc.
+        decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
+        decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
+
+        return decoded_markup
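
To illustrate how the new Scout class above is meant to be used, here is a small sketch based only on the methods shown in this hunk. The import path is an assumption (webscout/scout/__init__.py may re-export Scout more directly), and the HTML is a made-up document.

```python
# Illustrative sketch of the Scout API added in this release; import path is an assumption.
from webscout.scout.core.scout import Scout

html = """
<html>
  <head>
    <title>Demo page</title>
    <meta name="description" content="A tiny example document">
  </head>
  <body><h1>Hello</h1><a href="/about">About</a></body>
</html>
"""

scout = Scout(html, features="html.parser")

title = scout.find("title")                          # ScoutSearchResult wrapping the <title> tag
text = scout.get_text(strip=True)                    # sentence-tokenized page text
links = scout.extract_links("https://example.com")   # hrefs resolved against the base URL
digest = scout.hash_content("sha256")                # hash of the parsed markup

print(title, text, digest, sep="\n")
print(links)
```

The exact return values depend on ScoutSearchResult and the parser backends, which live in the other new scout/ modules listed in the file table above.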