webscout 6.4-py3-none-any.whl → 6.5-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (69)
  1. webscout/AIutel.py +7 -54
  2. webscout/DWEBS.py +48 -26
  3. webscout/{YTdownloader.py → Extra/YTToolkit/YTdownloader.py} +990 -1103
  4. webscout/Extra/YTToolkit/__init__.py +3 -0
  5. webscout/{transcriber.py → Extra/YTToolkit/transcriber.py} +1 -1
  6. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  7. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  8. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  9. webscout/Extra/YTToolkit/ytapi/extras.py +45 -0
  10. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  11. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  12. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  13. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  14. webscout/Extra/YTToolkit/ytapi/query.py +37 -0
  15. webscout/Extra/YTToolkit/ytapi/stream.py +60 -0
  16. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  17. webscout/Extra/YTToolkit/ytapi/video.py +102 -0
  18. webscout/Extra/__init__.py +2 -1
  19. webscout/Extra/autocoder/rawdog.py +679 -680
  20. webscout/Extra/gguf.py +441 -441
  21. webscout/Extra/markdownlite/__init__.py +862 -0
  22. webscout/Extra/weather_ascii.py +2 -2
  23. webscout/Provider/PI.py +292 -221
  24. webscout/Provider/Perplexity.py +6 -14
  25. webscout/Provider/Reka.py +0 -1
  26. webscout/Provider/TTS/__init__.py +5 -1
  27. webscout/Provider/TTS/deepgram.py +183 -0
  28. webscout/Provider/TTS/elevenlabs.py +137 -0
  29. webscout/Provider/TTS/gesserit.py +151 -0
  30. webscout/Provider/TTS/murfai.py +139 -0
  31. webscout/Provider/TTS/parler.py +134 -107
  32. webscout/Provider/TTS/streamElements.py +360 -275
  33. webscout/Provider/TTS/utils.py +280 -0
  34. webscout/Provider/TTS/voicepod.py +116 -116
  35. webscout/Provider/__init__.py +146 -146
  36. webscout/Provider/meta.py +794 -779
  37. webscout/Provider/typegpt.py +1 -2
  38. webscout/__init__.py +24 -28
  39. webscout/litprinter/__init__.py +831 -830
  40. webscout/optimizers.py +269 -269
  41. webscout/prompt_manager.py +279 -279
  42. webscout/scout/__init__.py +11 -0
  43. webscout/scout/core.py +884 -0
  44. webscout/scout/element.py +459 -0
  45. webscout/scout/parsers/__init__.py +69 -0
  46. webscout/scout/parsers/html5lib_parser.py +172 -0
  47. webscout/scout/parsers/html_parser.py +236 -0
  48. webscout/scout/parsers/lxml_parser.py +178 -0
  49. webscout/scout/utils.py +38 -0
  50. webscout/update_checker.py +125 -125
  51. webscout/version.py +1 -1
  52. webscout/zeroart/__init__.py +55 -0
  53. webscout/zeroart/base.py +61 -0
  54. webscout/zeroart/effects.py +99 -0
  55. webscout/zeroart/fonts.py +816 -0
  56. webscout/zerodir/__init__.py +225 -0
  57. {webscout-6.4.dist-info → webscout-6.5.dist-info}/METADATA +12 -68
  58. {webscout-6.4.dist-info → webscout-6.5.dist-info}/RECORD +62 -37
  59. webscout/Agents/Onlinesearcher.py +0 -182
  60. webscout/Agents/__init__.py +0 -2
  61. webscout/Agents/functioncall.py +0 -248
  62. webscout/Bing_search.py +0 -251
  63. webscout/gpt4free.py +0 -666
  64. webscout/requestsHTMLfix.py +0 -775
  65. webscout/webai.py +0 -2590
  66. {webscout-6.4.dist-info → webscout-6.5.dist-info}/LICENSE.md +0 -0
  67. {webscout-6.4.dist-info → webscout-6.5.dist-info}/WHEEL +0 -0
  68. {webscout-6.4.dist-info → webscout-6.5.dist-info}/entry_points.txt +0 -0
  69. {webscout-6.4.dist-info → webscout-6.5.dist-info}/top_level.txt +0 -0
webscout/scout/core.py ADDED
@@ -0,0 +1,884 @@
+ """
+ Scout - The next-gen web scraping library! 🚀
+ A powerful, flexible, and performant HTML parsing library.
+ Enhanced with advanced features and intelligent parsing.
+ """
+
+ import sys
+ import re
+ import warnings
+ import json
+ import hashlib
+ import unicodedata
+ import os
+ import requests
+ from markdownify import MarkdownConverter
+ import concurrent.futures
+ import urllib.parse
+ from collections import Counter, defaultdict
+ from typing import Union, List, Dict, Optional, Callable, Any, Tuple, Iterator, Set
+
+ from .parsers.html_parser import HTMLParser
+ from .parsers.lxml_parser import LXMLParser
+ from .parsers.html5lib_parser import HTML5Parser
+ from .element import Tag, NavigableString
+ from .utils import decode_markup
+
+ class ScoutTextAnalyzer:
+     """
+     Advanced text analysis and processing utility.
+     """
+     @staticmethod
+     def tokenize(text: str, lowercase=True, remove_punctuation=True) -> List[str]:
+         """
+         Tokenize text into words.
+
+         Args:
+             text (str): Input text
+             lowercase (bool, optional): Convert to lowercase
+             remove_punctuation (bool, optional): Remove punctuation
+
+         Returns:
+             List[str]: List of tokens
+         """
+         if lowercase:
+             text = text.lower()
+
+         if remove_punctuation:
+             text = re.sub(r'[^\w\s]', '', text)
+
+         return text.split()
+
+     @staticmethod
+     def count_words(text: str) -> Dict[str, int]:
+         """
+         Count word frequencies.
+
+         Args:
+             text (str): Input text
+
+         Returns:
+             Dict[str, int]: Word frequency dictionary
+         """
+         return dict(Counter(ScoutTextAnalyzer.tokenize(text)))
+
+     @staticmethod
+     def extract_entities(text: str) -> Dict[str, Set[str]]:
+         """
+         Extract named entities from text.
+
+         Args:
+             text (str): Input text
+
+         Returns:
+             Dict[str, Set[str]]: Extracted entities
+         """
+         entities = {
+             'emails': set(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)),
+             'urls': set(re.findall(r'https?://\S+', text)),
+             'phones': set(re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)),
+             'dates': set(re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text))
+         }
+         return entities
+
+ class ScoutWebAnalyzer:
+     """
+     Advanced web content analysis utility.
+     """
+     @staticmethod
+     def analyze_page_structure(scout_obj: 'Scout') -> Dict[str, Any]:
+         """
+         Analyze the structure of a web page.
+
+         Args:
+             scout_obj (Scout): Parsed Scout object
+
+         Returns:
+             Dict[str, Any]: Page structure analysis
+         """
+         analysis = {
+             'tag_distribution': {},
+             'class_distribution': {},
+             'id_distribution': {},
+             'depth_analysis': {}
+         }
+
+         # Tag distribution
+         for tag in scout_obj.find_all():
+             analysis['tag_distribution'][tag.name] = analysis['tag_distribution'].get(tag.name, 0) + 1
+
+         # Class distribution
+         for tag in scout_obj.find_all(attrs={'class': True}):
+             for cls in tag.get('class', []):
+                 analysis['class_distribution'][cls] = analysis['class_distribution'].get(cls, 0) + 1
+
+         # ID distribution
+         for tag in scout_obj.find_all(attrs={'id': True}):
+             analysis['id_distribution'][tag.get('id')] = analysis['id_distribution'].get(tag.get('id'), 0) + 1
+
+         # Depth analysis
+         def _analyze_depth(tag, current_depth=0):
+             analysis['depth_analysis'][current_depth] = analysis['depth_analysis'].get(current_depth, 0) + 1
+             for child in tag.contents:
+                 if isinstance(child, Tag):
+                     _analyze_depth(child, current_depth + 1)
+
+         _analyze_depth(scout_obj._soup)
+
+         return analysis
+
+ class ScoutSearchResult:
+     """
+     Represents a search result with advanced querying capabilities.
+     Enhanced with more intelligent filtering and processing.
+     """
+     def __init__(self, results: List[Tag]):
+         """
+         Initialize a search result collection.
+
+         Args:
+             results (List[Tag]): List of matching tags
+         """
+         self._results = results
+
+     def __len__(self) -> int:
+         return len(self._results)
+
+     def __iter__(self) -> Iterator[Tag]:
+         return iter(self._results)
+
+     def __getitem__(self, index: Union[int, slice]) -> Union[Tag, List[Tag]]:
+         return self._results[index]
+
+     def texts(self, separator=' ', strip=True) -> List[str]:
+         """
+         Extract texts from all results.
+
+         Args:
+             separator (str, optional): Text separator
+             strip (bool, optional): Strip whitespace
+
+         Returns:
+             List[str]: List of extracted texts
+         """
+         return [tag.get_text(separator, strip) for tag in self._results]
+
+     def attrs(self, attr_name: str) -> List[Any]:
+         """
+         Extract a specific attribute from all results.
+
+         Args:
+             attr_name (str): Attribute name to extract
+
+         Returns:
+             List[Any]: List of attribute values
+         """
+         return [tag.get(attr_name) for tag in self._results]
+
+     def filter(self, predicate: Callable[[Tag], bool]) -> 'ScoutSearchResult':
+         """
+         Filter results using a predicate function.
+
+         Args:
+             predicate (Callable[[Tag], bool]): Filtering function
+
+         Returns:
+             ScoutSearchResult: Filtered search results
+         """
+         return ScoutSearchResult([tag for tag in self._results if predicate(tag)])
+
+     def map(self, transform: Callable[[Tag], Any]) -> List[Any]:
+         """
+         Transform results using a mapping function.
+
+         Args:
+             transform (Callable[[Tag], Any]): Transformation function
+
+         Returns:
+             List[Any]: Transformed results
+         """
+         return [transform(tag) for tag in self._results]
+
+     def analyze_text(self) -> Dict[str, Any]:
+         """
+         Perform text analysis on search results.
+
+         Returns:
+             Dict[str, Any]: Text analysis results
+         """
+         texts = self.texts(strip=True)
+         full_text = ' '.join(texts)
+
+         return {
+             'total_results': len(self._results),
+             'word_count': ScoutTextAnalyzer.count_words(full_text),
+             'entities': ScoutTextAnalyzer.extract_entities(full_text)
+         }
+
+ class ScoutCrawler:
+     """
+     Advanced web crawling utility for Scout library.
+     """
+     def __init__(self, base_url: str, max_depth: int = 3, max_pages: int = 50):
+         """
+         Initialize the web crawler.
+
+         Args:
+             base_url (str): Starting URL to crawl
+             max_depth (int, optional): Maximum crawl depth
+             max_pages (int, optional): Maximum number of pages to crawl
+         """
+         self.base_url = base_url
+         self.max_depth = max_depth
+         self.max_pages = max_pages
+         self.visited_urls = set()
+         self.crawled_pages = []
+
+     def _is_valid_url(self, url: str) -> bool:
+         """
+         Check if a URL is valid and within the same domain.
+
+         Args:
+             url (str): URL to validate
+
+         Returns:
+             bool: Whether the URL is valid
+         """
+         try:
+             parsed_base = urllib.parse.urlparse(self.base_url)
+             parsed_url = urllib.parse.urlparse(url)
+
+             return (
+                 parsed_url.scheme in ['http', 'https'] and
+                 parsed_base.netloc == parsed_url.netloc and
+                 len(self.visited_urls) < self.max_pages
+             )
+         except Exception:
+             return False
+
+     def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
+         """
+         Crawl a single page and extract information.
+
+         Args:
+             url (str): URL to crawl
+             depth (int, optional): Current crawl depth
+
+         Returns:
+             Dict[str, Union[str, List[str]]]: Crawled page information
+         """
+         if depth > self.max_depth or url in self.visited_urls:
+             return {}
+
+         try:
+             response = requests.get(url, timeout=10)
+             response.raise_for_status()
+
+             scout = Scout(response.text, features='lxml')
+
+             page_info = {
+                 'url': url,
+                 # Record the depth so crawl() can schedule child pages one level deeper.
+                 'depth': depth,
+                 'title': scout.find('title').get_text() if scout.find('title') else '',
+                 'links': [
+                     urllib.parse.urljoin(url, link.get('href'))
+                     for link in scout.find_all('a', href=True)
+                     if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                 ],
+                 'text': scout.get_text(),
+                 'markdown': scout.to_markdown()
+             }
+
+             self.visited_urls.add(url)
+             self.crawled_pages.append(page_info)
+
+             return page_info
+         except Exception as e:
+             print(f"Error crawling {url}: {e}")
+             return {}
+
+     def crawl(self) -> List[Dict[str, Union[str, List[str]]]]:
+         """
+         Start web crawling from base URL.
+
+         Returns:
+             List[Dict[str, Union[str, List[str]]]]: List of crawled pages
+         """
+         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+             futures = {executor.submit(self._crawl_page, self.base_url, 0)}
+
+             while futures:
+                 done, futures = concurrent.futures.wait(
+                     futures, return_when=concurrent.futures.FIRST_COMPLETED
+                 )
+
+                 for future in done:
+                     page_info = future.result()
+
+                     for link in page_info.get('links', []):
+                         if link not in self.visited_urls:
+                             futures.add(
+                                 executor.submit(
+                                     self._crawl_page,
+                                     link,
+                                     # visited_urls is a set, so it has no count(); use the
+                                     # parent page's recorded depth plus one instead.
+                                     page_info.get('depth', 0) + 1
+                                 )
+                             )
+
+         return self.crawled_pages
+
+ class Scout:
+     """
+     Scout - Making web scraping a breeze! 🌊
+     A comprehensive HTML parsing and traversal library.
+     Enhanced with advanced features and intelligent parsing.
+     """
+
+     # Available parsers - choose your weapon! ⚔️
+     PARSERS = {
+         'html.parser': HTMLParser,
+         'lxml': LXMLParser,
+         'html5lib': HTML5Parser,
+         'lxml-xml': LXMLParser,  # For XML parsing
+     }
+
+     def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
+         """
+         Initialize Scout with HTML content.
+
+         Args:
+             markup (str): HTML content to parse
+             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
+             from_encoding (str): Source encoding (if known)
+             **kwargs: Additional parsing options
+         """
+         # Intelligent markup handling
+         self.markup = self._preprocess_markup(markup, from_encoding)
+         self.features = features
+         self.from_encoding = from_encoding
+
+         # Get the right parser for the job
+         if features not in self.PARSERS:
+             raise ValueError(
+                 f"Invalid parser '{features}'! Choose from: {', '.join(self.PARSERS.keys())}"
+             )
+
+         parser_class = self.PARSERS[features]
+         self.parser = parser_class()
+
+         # Parse that HTML! 🎯
+         self._soup = self.parser.parse(self.markup)
+
+         # BeautifulSoup-like attributes
+         self.name = self._soup.name if hasattr(self._soup, 'name') else None
+         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
+
+         # Advanced parsing options
+         self._cache = {}
+
+         # Text and web analyzers
+         self.text_analyzer = ScoutTextAnalyzer()
+         self.web_analyzer = ScoutWebAnalyzer()
+
+     def normalize_text(self, text: str, form='NFKD') -> str:
+         """
+         Normalize text using Unicode normalization.
+
+         Args:
+             text (str): Input text
+             form (str, optional): Normalization form
+
+         Returns:
+             str: Normalized text
+         """
+         return unicodedata.normalize(form, text)
+
+     def url_parse(self, url: str) -> Dict[str, str]:
+         """
+         Parse and analyze a URL.
+
+         Args:
+             url (str): URL to parse
+
+         Returns:
+             Dict[str, str]: Parsed URL components
+         """
+         parsed = urllib.parse.urlparse(url)
+         return {
+             'scheme': parsed.scheme,
+             'netloc': parsed.netloc,
+             'path': parsed.path,
+             'params': parsed.params,
+             'query': parsed.query,
+             'fragment': parsed.fragment
+         }
+
+     def analyze_page_structure(self) -> Dict[str, Any]:
+         """
+         Analyze the structure of the parsed page.
+
+         Returns:
+             Dict[str, Any]: Page structure analysis
+         """
+         return self.web_analyzer.analyze_page_structure(self)
+
+     def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
+         """
+         Perform advanced text analysis.
+
+         Args:
+             text (str, optional): Text to analyze. If None, uses page text.
+
+         Returns:
+             Dict[str, Any]: Text analysis results
+         """
+         if text is None:
+             text = self.get_text()
+
+         return {
+             'word_count': self.text_analyzer.count_words(text),
+             'entities': self.text_analyzer.extract_entities(text),
+             'tokens': self.text_analyzer.tokenize(text)
+         }
+
+     def extract_semantic_info(self) -> Dict[str, Any]:
+         """
+         Extract semantic information from the document.
+
+         Returns:
+             Dict[str, Any]: Semantic information
+         """
+         semantic_info = {
+             'headings': {
+                 'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
+                 'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
+                 'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
+             },
+             'lists': {
+                 'ul': [ul.find_all('li') for ul in self.find_all('ul')],
+                 'ol': [ol.find_all('li') for ol in self.find_all('ol')]
+             },
+             'tables': {
+                 'count': len(self.find_all('table')),
+                 'headers': [table.find_all('th') for table in self.find_all('table')]
+             }
+         }
+         return semantic_info
+
+     def cache(self, key: str, value: Any = None) -> Any:
+         """
+         Manage a cache for parsed content.
+
+         Args:
+             key (str): Cache key
+             value (Any, optional): Value to cache
+
+         Returns:
+             Any: Cached value or None
+         """
+         if value is not None:
+             self._cache[key] = value
+         return self._cache.get(key)
+
+     def hash_content(self, method='md5') -> str:
+         """
+         Generate a hash of the parsed content.
+
+         Args:
+             method (str, optional): Hashing method
+
+         Returns:
+             str: Content hash
+         """
+         hash_methods = {
+             'md5': hashlib.md5,
+             'sha1': hashlib.sha1,
+             'sha256': hashlib.sha256
+         }
+
+         if method not in hash_methods:
+             raise ValueError(f"Unsupported hash method: {method}")
+
+         hasher = hash_methods[method]()
+         hasher.update(str(self._soup).encode('utf-8'))
+         return hasher.hexdigest()
+
+     def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
+         """
+         Extract all links from the document.
+
+         Args:
+             base_url (str, optional): Base URL for resolving relative links
+
+         Returns:
+             List[Dict[str, str]]: List of link dictionaries
+         """
+         links = []
+         for link in self.find_all(['a', 'link']):
+             href = link.get('href')
+             if href:
+                 # Resolve relative URLs if base_url is provided
+                 if base_url and not href.startswith(('http://', 'https://', '//')):
+                     href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
+
+                 links.append({
+                     'href': href,
+                     'text': link.get_text(strip=True),
+                     'rel': link.get('rel', [None])[0],
+                     'type': link.get('type')
+                 })
+         return links
+
+     def extract_metadata(self) -> Dict[str, Any]:
+         """
+         Extract metadata from HTML document.
+
+         Returns:
+             Dict[str, Any]: Extracted metadata
+         """
+         metadata = {
+             'title': self.find('title').get_text() if self.find('title') else None,
+             'description': self.find('meta', attrs={'name': 'description'}).get('content') if self.find('meta', attrs={'name': 'description'}) else None,
+             'keywords': self.find('meta', attrs={'name': 'keywords'}).get('content', '').split(',') if self.find('meta', attrs={'name': 'keywords'}) else [],
+             'og_metadata': {},
+             'twitter_metadata': {}
+         }
+
+         # Open Graph metadata
+         for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
+             key = meta.get('property')[3:]
+             metadata['og_metadata'][key] = meta.get('content')
+
+         # Twitter Card metadata
+         for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
+             key = meta.get('name')[8:]
+             metadata['twitter_metadata'][key] = meta.get('content')
+
+         return metadata
+
+     def to_json(self, indent=2) -> str:
+         """
+         Convert parsed content to JSON.
+
+         Args:
+             indent (int, optional): JSON indentation
+
+         Returns:
+             str: JSON representation of the document
+         """
+         def _tag_to_dict(tag):
+             if isinstance(tag, NavigableString):
+                 return str(tag)
+
+             result = {
+                 'name': tag.name,
+                 'attrs': tag.attrs,
+                 'text': tag.get_text(strip=True)
+             }
+
+             if tag.contents:
+                 result['children'] = [_tag_to_dict(child) for child in tag.contents]
+
+             return result
+
+         return json.dumps(_tag_to_dict(self._soup), indent=indent)
+
+     def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> Optional[Tag]:
+         """
+         Find the first matching element.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             recursive (bool, optional): Search recursively
+             text (str, optional): Text content to match
+
+         Returns:
+             Tag or None: First matching element
+         """
+         # Return the matching Tag (or None) directly, as the signature and docstring state;
+         # internal callers such as extract_metadata() and ScoutCrawler rely on Tag methods
+         # like get_text() and get().
+         return self._soup.find(name, attrs, recursive, text, **kwargs)
+
+     def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
+         """
+         Find all matching elements.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             recursive (bool, optional): Search recursively
+             text (str, optional): Text content to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             ScoutSearchResult: List of matching elements
+         """
+         results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
+         return ScoutSearchResult(results)
+
+     def find_parent(self, name=None, attrs={}, **kwargs):
+         """
+         Find the first parent matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+
+         Returns:
+             Tag or None: First matching parent
+         """
+         current = self._soup.parent
+         while current:
+             if (name is None or current.name == name) and \
+                all(current.get(k) == v for k, v in attrs.items()):
+                 return current
+             current = current.parent
+         return None
+
+     def find_parents(self, name=None, attrs={}, limit=None, **kwargs):
+         """
+         Find all parents matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             List[Tag]: List of matching parents
+         """
+         parents = []
+         current = self._soup.parent
+         while current and (limit is None or len(parents) < limit):
+             if (name is None or current.name == name) and \
+                all(current.get(k) == v for k, v in attrs.items()):
+                 parents.append(current)
+             current = current.parent
+         return parents
+
+     def find_next_sibling(self, name=None, attrs={}, **kwargs):
+         """
+         Find the next sibling matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+
+         Returns:
+             Tag or None: First matching next sibling
+         """
+         if not self._soup.parent:
+             return None
+
+         siblings = self._soup.parent.contents
+         try:
+             current_index = siblings.index(self._soup)
+             for sibling in siblings[current_index + 1:]:
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and \
+                        all(sibling.get(k) == v for k, v in attrs.items()):
+                         return sibling
+         except ValueError:
+             pass
+         return None
+
+     def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs):
+         """
+         Find all next siblings matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             List[Tag]: List of matching next siblings
+         """
+         if not self._soup.parent:
+             return []
+
+         siblings = []
+         siblings_list = self._soup.parent.contents
+         try:
+             current_index = siblings_list.index(self._soup)
+             for sibling in siblings_list[current_index + 1:]:
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and \
+                        all(sibling.get(k) == v for k, v in attrs.items()):
+                         siblings.append(sibling)
+                         if limit and len(siblings) == limit:
+                             break
+         except ValueError:
+             pass
+         return siblings
+
+     def select(self, selector: str) -> List[Tag]:
+         """
+         Select elements using CSS selector.
+
+         Args:
+             selector (str): CSS selector string
+
+         Returns:
+             List[Tag]: List of matching elements
+         """
+         return self._soup.select(selector)
+
+     def select_one(self, selector: str) -> Optional[Tag]:
+         """
+         Select the first element matching the CSS selector.
+
+         Args:
+             selector (str): CSS selector string
+
+         Returns:
+             Tag or None: First matching element
+         """
+         return self._soup.select_one(selector)
+
+     def get_text(self, separator=' ', strip=False, types=None) -> str:
+         """
+         Extract all text from the parsed document.
+
+         Args:
+             separator (str, optional): Text separator
+             strip (bool, optional): Strip whitespace
+             types (list, optional): Types of content to extract
+
+         Returns:
+             str: Extracted text
+         """
+         return self._soup.get_text(separator, strip, types)
+
+     def prettify(self, formatter='minimal') -> str:
+         """
+         Return a formatted, pretty-printed version of the HTML.
+
+         Args:
+             formatter (str, optional): Formatting style
+
+         Returns:
+             str: Prettified HTML
+         """
+         return self._soup.prettify(formatter)
+
+     def decompose(self, tag: Tag = None) -> None:
+         """
+         Remove a tag and its contents from the document.
+
+         Args:
+             tag (Tag, optional): Tag to remove. If None, removes the root tag.
+         """
+         if tag is None:
+             tag = self._soup
+         tag.decompose()
+
+     def extract(self, tag: Tag = None) -> Tag:
+         """
+         Remove a tag from the document and return it.
+
+         Args:
+             tag (Tag, optional): Tag to extract. If None, extracts the root tag.
+
+         Returns:
+             Tag: Extracted tag
+         """
+         if tag is None:
+             tag = self._soup
+         return tag.extract()
+
+     def clear(self, tag: Tag = None) -> None:
+         """
+         Remove a tag's contents while keeping the tag itself.
+
+         Args:
+             tag (Tag, optional): Tag to clear. If None, clears the root tag.
+         """
+         if tag is None:
+             tag = self._soup
+         tag.clear()
+
+     def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
+         """
+         Replace one tag with another.
+
+         Args:
+             old_tag (Tag): Tag to replace
+             new_tag (Tag): Replacement tag
+         """
+         old_tag.replace_with(new_tag)
+
+     def encode(self, encoding='utf-8') -> bytes:
+         """
+         Encode the document to a specific encoding.
+
+         Args:
+             encoding (str, optional): Encoding to use
+
+         Returns:
+             bytes: Encoded document
+         """
+         return str(self._soup).encode(encoding)
+
+     def decode(self, encoding='utf-8') -> str:
+         """
+         Decode the document from a specific encoding.
+
+         Args:
+             encoding (str, optional): Encoding to use
+
+         Returns:
+             str: Decoded document
+         """
+         return str(self._soup)
+
+     def __str__(self) -> str:
+         """
+         String representation of the parsed document.
+
+         Returns:
+             str: HTML content
+         """
+         return str(self._soup)
+
+     def __repr__(self) -> str:
+         """
+         Detailed representation of the Scout object.
+
+         Returns:
+             str: Scout object description
+         """
+         return f"Scout(features='{self.features}', content_length={len(self.markup)})"
+
+     def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
+         """
+         Preprocess markup before parsing.
+
+         Args:
+             markup (str): Input markup
+             encoding (str, optional): Encoding to use
+
+         Returns:
+             str: Preprocessed markup
+         """
+         # Decode markup
+         decoded_markup = decode_markup(markup, encoding)
+
+         # Basic HTML cleaning
+         # Remove comments, normalize whitespace, etc.
+         decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
+         decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
+
+         return decoded_markup
+
+     def to_markdown(self, heading_style='ATX') -> str:
+         """
+         Convert HTML to Markdown.
+
+         Args:
+             heading_style (str, optional): Markdown heading style
+
+         Returns:
+             str: Markdown representation of the document
+         """
+         converter = MarkdownConverter(heading_style=heading_style)
+         return converter.convert(str(self._soup))
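
For orientation, a minimal usage sketch of the Scout API introduced by this file. It is illustrative only: the sample HTML and variable names are not taken from the package, and it assumes the new webscout/scout/__init__.py re-exports Scout (and that the optional lxml/html5lib backends are installed if you select them).

    from webscout.scout import Scout  # assumed re-export from the new scout subpackage

    html = "<html><head><title>Demo</title></head><body><a href='/about'>About</a></body></html>"
    page = Scout(html, features='html.parser')  # 'lxml', 'html5lib', 'lxml-xml' are also registered

    title = page.find('title')                  # first matching Tag, or None
    print(title.get_text() if title else '')
    for link in page.extract_links(base_url='https://example.com'):
        print(link['href'], link['text'])       # resolved href and anchor text
    print(page.to_markdown())                   # Markdown rendering via markdownify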