webscout 8.2.7__py3-none-any.whl → 8.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281) hide show
  1. webscout/AIauto.py +33 -15
  2. webscout/AIbase.py +96 -37
  3. webscout/AIutel.py +703 -250
  4. webscout/Bard.py +441 -323
  5. webscout/Extra/Act.md +309 -0
  6. webscout/Extra/GitToolkit/__init__.py +10 -0
  7. webscout/Extra/GitToolkit/gitapi/README.md +110 -0
  8. webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
  10. webscout/Extra/GitToolkit/gitapi/user.py +96 -0
  11. webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
  12. webscout/Extra/YTToolkit/README.md +375 -0
  13. webscout/Extra/YTToolkit/YTdownloader.py +957 -0
  14. webscout/Extra/YTToolkit/__init__.py +3 -0
  15. webscout/Extra/YTToolkit/transcriber.py +476 -0
  16. webscout/Extra/YTToolkit/ytapi/README.md +44 -0
  17. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  18. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  19. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  20. webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
  21. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  22. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  23. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  24. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  25. webscout/Extra/YTToolkit/ytapi/query.py +40 -0
  26. webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
  27. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  28. webscout/Extra/YTToolkit/ytapi/video.py +232 -0
  29. webscout/Extra/__init__.py +7 -0
  30. webscout/Extra/autocoder/__init__.py +9 -0
  31. webscout/Extra/autocoder/autocoder.py +1105 -0
  32. webscout/Extra/autocoder/autocoder_utiles.py +332 -0
  33. webscout/Extra/gguf.md +430 -0
  34. webscout/Extra/gguf.py +684 -0
  35. webscout/Extra/tempmail/README.md +488 -0
  36. webscout/Extra/tempmail/__init__.py +28 -0
  37. webscout/Extra/tempmail/async_utils.py +141 -0
  38. webscout/Extra/tempmail/base.py +161 -0
  39. webscout/Extra/tempmail/cli.py +187 -0
  40. webscout/Extra/tempmail/emailnator.py +84 -0
  41. webscout/Extra/tempmail/mail_tm.py +361 -0
  42. webscout/Extra/tempmail/temp_mail_io.py +292 -0
  43. webscout/Extra/weather.md +281 -0
  44. webscout/Extra/weather.py +194 -0
  45. webscout/Extra/weather_ascii.py +76 -0
  46. webscout/Litlogger/README.md +10 -0
  47. webscout/Litlogger/__init__.py +15 -0
  48. webscout/Litlogger/formats.py +4 -0
  49. webscout/Litlogger/handlers.py +103 -0
  50. webscout/Litlogger/levels.py +13 -0
  51. webscout/Litlogger/logger.py +92 -0
  52. webscout/Provider/AI21.py +177 -0
  53. webscout/Provider/AISEARCH/DeepFind.py +254 -0
  54. webscout/Provider/AISEARCH/Perplexity.py +333 -0
  55. webscout/Provider/AISEARCH/README.md +279 -0
  56. webscout/Provider/AISEARCH/__init__.py +9 -0
  57. webscout/Provider/AISEARCH/felo_search.py +202 -0
  58. webscout/Provider/AISEARCH/genspark_search.py +324 -0
  59. webscout/Provider/AISEARCH/hika_search.py +186 -0
  60. webscout/Provider/AISEARCH/iask_search.py +410 -0
  61. webscout/Provider/AISEARCH/monica_search.py +220 -0
  62. webscout/Provider/AISEARCH/scira_search.py +298 -0
  63. webscout/Provider/AISEARCH/webpilotai_search.py +255 -0
  64. webscout/Provider/Aitopia.py +316 -0
  65. webscout/Provider/AllenAI.py +440 -0
  66. webscout/Provider/Andi.py +228 -0
  67. webscout/Provider/Blackboxai.py +791 -0
  68. webscout/Provider/ChatGPTClone.py +237 -0
  69. webscout/Provider/ChatGPTGratis.py +194 -0
  70. webscout/Provider/ChatSandbox.py +342 -0
  71. webscout/Provider/Cloudflare.py +324 -0
  72. webscout/Provider/Cohere.py +208 -0
  73. webscout/Provider/Deepinfra.py +340 -0
  74. webscout/Provider/ExaAI.py +261 -0
  75. webscout/Provider/ExaChat.py +358 -0
  76. webscout/Provider/Flowith.py +217 -0
  77. webscout/Provider/FreeGemini.py +250 -0
  78. webscout/Provider/Gemini.py +169 -0
  79. webscout/Provider/GithubChat.py +369 -0
  80. webscout/Provider/GizAI.py +295 -0
  81. webscout/Provider/Glider.py +225 -0
  82. webscout/Provider/Groq.py +801 -0
  83. webscout/Provider/HF_space/__init__.py +0 -0
  84. webscout/Provider/HF_space/qwen_qwen2.py +206 -0
  85. webscout/Provider/HeckAI.py +375 -0
  86. webscout/Provider/HuggingFaceChat.py +469 -0
  87. webscout/Provider/Hunyuan.py +283 -0
  88. webscout/Provider/Jadve.py +291 -0
  89. webscout/Provider/Koboldai.py +384 -0
  90. webscout/Provider/LambdaChat.py +411 -0
  91. webscout/Provider/Llama3.py +259 -0
  92. webscout/Provider/MCPCore.py +315 -0
  93. webscout/Provider/Marcus.py +198 -0
  94. webscout/Provider/Nemotron.py +218 -0
  95. webscout/Provider/Netwrck.py +270 -0
  96. webscout/Provider/OLLAMA.py +396 -0
  97. webscout/Provider/OPENAI/BLACKBOXAI.py +766 -0
  98. webscout/Provider/OPENAI/Cloudflare.py +378 -0
  99. webscout/Provider/OPENAI/FreeGemini.py +283 -0
  100. webscout/Provider/OPENAI/NEMOTRON.py +232 -0
  101. webscout/Provider/OPENAI/Qwen3.py +283 -0
  102. webscout/Provider/OPENAI/README.md +952 -0
  103. webscout/Provider/OPENAI/TwoAI.py +357 -0
  104. webscout/Provider/OPENAI/__init__.py +40 -0
  105. webscout/Provider/OPENAI/ai4chat.py +293 -0
  106. webscout/Provider/OPENAI/api.py +969 -0
  107. webscout/Provider/OPENAI/base.py +249 -0
  108. webscout/Provider/OPENAI/c4ai.py +373 -0
  109. webscout/Provider/OPENAI/chatgpt.py +556 -0
  110. webscout/Provider/OPENAI/chatgptclone.py +494 -0
  111. webscout/Provider/OPENAI/chatsandbox.py +173 -0
  112. webscout/Provider/OPENAI/copilot.py +242 -0
  113. webscout/Provider/OPENAI/deepinfra.py +322 -0
  114. webscout/Provider/OPENAI/e2b.py +1414 -0
  115. webscout/Provider/OPENAI/exaai.py +417 -0
  116. webscout/Provider/OPENAI/exachat.py +444 -0
  117. webscout/Provider/OPENAI/flowith.py +162 -0
  118. webscout/Provider/OPENAI/freeaichat.py +359 -0
  119. webscout/Provider/OPENAI/glider.py +326 -0
  120. webscout/Provider/OPENAI/groq.py +364 -0
  121. webscout/Provider/OPENAI/heckai.py +308 -0
  122. webscout/Provider/OPENAI/llmchatco.py +335 -0
  123. webscout/Provider/OPENAI/mcpcore.py +389 -0
  124. webscout/Provider/OPENAI/multichat.py +376 -0
  125. webscout/Provider/OPENAI/netwrck.py +357 -0
  126. webscout/Provider/OPENAI/oivscode.py +287 -0
  127. webscout/Provider/OPENAI/opkfc.py +496 -0
  128. webscout/Provider/OPENAI/pydantic_imports.py +172 -0
  129. webscout/Provider/OPENAI/scirachat.py +477 -0
  130. webscout/Provider/OPENAI/sonus.py +304 -0
  131. webscout/Provider/OPENAI/standardinput.py +433 -0
  132. webscout/Provider/OPENAI/textpollinations.py +339 -0
  133. webscout/Provider/OPENAI/toolbaz.py +413 -0
  134. webscout/Provider/OPENAI/typefully.py +355 -0
  135. webscout/Provider/OPENAI/typegpt.py +364 -0
  136. webscout/Provider/OPENAI/uncovrAI.py +463 -0
  137. webscout/Provider/OPENAI/utils.py +318 -0
  138. webscout/Provider/OPENAI/venice.py +431 -0
  139. webscout/Provider/OPENAI/wisecat.py +387 -0
  140. webscout/Provider/OPENAI/writecream.py +163 -0
  141. webscout/Provider/OPENAI/x0gpt.py +365 -0
  142. webscout/Provider/OPENAI/yep.py +382 -0
  143. webscout/Provider/OpenGPT.py +209 -0
  144. webscout/Provider/Openai.py +496 -0
  145. webscout/Provider/PI.py +429 -0
  146. webscout/Provider/Perplexitylabs.py +415 -0
  147. webscout/Provider/QwenLM.py +254 -0
  148. webscout/Provider/Reka.py +214 -0
  149. webscout/Provider/StandardInput.py +290 -0
  150. webscout/Provider/TTI/README.md +82 -0
  151. webscout/Provider/TTI/__init__.py +7 -0
  152. webscout/Provider/TTI/aiarta.py +365 -0
  153. webscout/Provider/TTI/artbit.py +0 -0
  154. webscout/Provider/TTI/base.py +64 -0
  155. webscout/Provider/TTI/fastflux.py +200 -0
  156. webscout/Provider/TTI/magicstudio.py +201 -0
  157. webscout/Provider/TTI/piclumen.py +203 -0
  158. webscout/Provider/TTI/pixelmuse.py +225 -0
  159. webscout/Provider/TTI/pollinations.py +221 -0
  160. webscout/Provider/TTI/utils.py +11 -0
  161. webscout/Provider/TTS/README.md +192 -0
  162. webscout/Provider/TTS/__init__.py +10 -0
  163. webscout/Provider/TTS/base.py +159 -0
  164. webscout/Provider/TTS/deepgram.py +156 -0
  165. webscout/Provider/TTS/elevenlabs.py +111 -0
  166. webscout/Provider/TTS/gesserit.py +128 -0
  167. webscout/Provider/TTS/murfai.py +113 -0
  168. webscout/Provider/TTS/openai_fm.py +129 -0
  169. webscout/Provider/TTS/parler.py +111 -0
  170. webscout/Provider/TTS/speechma.py +580 -0
  171. webscout/Provider/TTS/sthir.py +94 -0
  172. webscout/Provider/TTS/streamElements.py +333 -0
  173. webscout/Provider/TTS/utils.py +280 -0
  174. webscout/Provider/TeachAnything.py +229 -0
  175. webscout/Provider/TextPollinationsAI.py +308 -0
  176. webscout/Provider/TwoAI.py +475 -0
  177. webscout/Provider/TypliAI.py +305 -0
  178. webscout/Provider/UNFINISHED/ChatHub.py +209 -0
  179. webscout/Provider/UNFINISHED/Youchat.py +330 -0
  180. webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
  181. webscout/Provider/UNFINISHED/puterjs.py +635 -0
  182. webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
  183. webscout/Provider/Venice.py +258 -0
  184. webscout/Provider/VercelAI.py +253 -0
  185. webscout/Provider/WiseCat.py +233 -0
  186. webscout/Provider/WrDoChat.py +370 -0
  187. webscout/Provider/Writecream.py +246 -0
  188. webscout/Provider/WritingMate.py +269 -0
  189. webscout/Provider/__init__.py +174 -0
  190. webscout/Provider/ai4chat.py +174 -0
  191. webscout/Provider/akashgpt.py +335 -0
  192. webscout/Provider/asksteve.py +220 -0
  193. webscout/Provider/cerebras.py +290 -0
  194. webscout/Provider/chatglm.py +215 -0
  195. webscout/Provider/cleeai.py +213 -0
  196. webscout/Provider/copilot.py +425 -0
  197. webscout/Provider/elmo.py +283 -0
  198. webscout/Provider/freeaichat.py +285 -0
  199. webscout/Provider/geminiapi.py +208 -0
  200. webscout/Provider/granite.py +235 -0
  201. webscout/Provider/hermes.py +266 -0
  202. webscout/Provider/julius.py +223 -0
  203. webscout/Provider/koala.py +170 -0
  204. webscout/Provider/learnfastai.py +325 -0
  205. webscout/Provider/llama3mitril.py +215 -0
  206. webscout/Provider/llmchat.py +258 -0
  207. webscout/Provider/llmchatco.py +306 -0
  208. webscout/Provider/lmarena.py +198 -0
  209. webscout/Provider/meta.py +801 -0
  210. webscout/Provider/multichat.py +364 -0
  211. webscout/Provider/oivscode.py +309 -0
  212. webscout/Provider/samurai.py +224 -0
  213. webscout/Provider/scira_chat.py +299 -0
  214. webscout/Provider/scnet.py +243 -0
  215. webscout/Provider/searchchat.py +292 -0
  216. webscout/Provider/sonus.py +258 -0
  217. webscout/Provider/talkai.py +194 -0
  218. webscout/Provider/toolbaz.py +353 -0
  219. webscout/Provider/turboseek.py +266 -0
  220. webscout/Provider/typefully.py +202 -0
  221. webscout/Provider/typegpt.py +289 -0
  222. webscout/Provider/uncovr.py +368 -0
  223. webscout/Provider/x0gpt.py +299 -0
  224. webscout/Provider/yep.py +389 -0
  225. webscout/__init__.py +4 -2
  226. webscout/cli.py +3 -28
  227. webscout/client.py +70 -0
  228. webscout/conversation.py +35 -35
  229. webscout/litagent/Readme.md +276 -0
  230. webscout/litagent/__init__.py +29 -0
  231. webscout/litagent/agent.py +455 -0
  232. webscout/litagent/constants.py +60 -0
  233. webscout/litprinter/__init__.py +59 -0
  234. webscout/optimizers.py +419 -419
  235. webscout/scout/README.md +404 -0
  236. webscout/scout/__init__.py +8 -0
  237. webscout/scout/core/__init__.py +7 -0
  238. webscout/scout/core/crawler.py +210 -0
  239. webscout/scout/core/scout.py +607 -0
  240. webscout/scout/core/search_result.py +96 -0
  241. webscout/scout/core/text_analyzer.py +63 -0
  242. webscout/scout/core/text_utils.py +277 -0
  243. webscout/scout/core/web_analyzer.py +52 -0
  244. webscout/scout/element.py +478 -0
  245. webscout/scout/parsers/__init__.py +69 -0
  246. webscout/scout/parsers/html5lib_parser.py +172 -0
  247. webscout/scout/parsers/html_parser.py +236 -0
  248. webscout/scout/parsers/lxml_parser.py +178 -0
  249. webscout/scout/utils.py +37 -0
  250. webscout/swiftcli/Readme.md +323 -0
  251. webscout/swiftcli/__init__.py +95 -0
  252. webscout/swiftcli/core/__init__.py +7 -0
  253. webscout/swiftcli/core/cli.py +297 -0
  254. webscout/swiftcli/core/context.py +104 -0
  255. webscout/swiftcli/core/group.py +241 -0
  256. webscout/swiftcli/decorators/__init__.py +28 -0
  257. webscout/swiftcli/decorators/command.py +221 -0
  258. webscout/swiftcli/decorators/options.py +220 -0
  259. webscout/swiftcli/decorators/output.py +252 -0
  260. webscout/swiftcli/exceptions.py +21 -0
  261. webscout/swiftcli/plugins/__init__.py +9 -0
  262. webscout/swiftcli/plugins/base.py +135 -0
  263. webscout/swiftcli/plugins/manager.py +269 -0
  264. webscout/swiftcli/utils/__init__.py +59 -0
  265. webscout/swiftcli/utils/formatting.py +252 -0
  266. webscout/swiftcli/utils/parsing.py +267 -0
  267. webscout/version.py +1 -1
  268. webscout/webscout_search.py +2 -182
  269. webscout/webscout_search_async.py +1 -179
  270. webscout/zeroart/README.md +89 -0
  271. webscout/zeroart/__init__.py +135 -0
  272. webscout/zeroart/base.py +66 -0
  273. webscout/zeroart/effects.py +101 -0
  274. webscout/zeroart/fonts.py +1239 -0
  275. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/METADATA +262 -83
  276. webscout-8.2.9.dist-info/RECORD +289 -0
  277. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
  278. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
  279. webscout-8.2.7.dist-info/RECORD +0 -26
  280. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
  281. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,607 @@
1
+ """
2
+ Scout Main Module - HTML Parsing and Traversal
3
+ """
4
+ import hashlib
5
+ import json
6
+ import re
7
+ import unicodedata
8
+ import urllib.parse
9
+ from typing import Any, Dict, List, Optional
10
+
11
+ from ..element import NavigableString, Tag
12
+ from ..parsers import ParserRegistry
13
+ from ..utils import decode_markup
14
+ from .search_result import ScoutSearchResult
15
+ from .text_analyzer import ScoutTextAnalyzer
16
+ from .text_utils import SentenceTokenizer
17
+ from .web_analyzer import ScoutWebAnalyzer
18
+
19
+
20
class Scout:
    """
    Scout - Making web scraping a breeze! 🌊
    A comprehensive HTML parsing and traversal library.
    Enhanced with advanced features and intelligent parsing.
    """

    def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
        """
        Initialize Scout with HTML content.

        Args:
            markup (str): HTML content to parse
            features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
            from_encoding (str): Source encoding (if known)
            **kwargs: Additional parsing options (currently unused, accepted
                for BeautifulSoup call-compatibility)

        Raises:
            ValueError: If ``features`` is not a registered parser name.
        """
        # Intelligent markup handling: decode + lightly clean before parsing.
        self.markup = self._preprocess_markup(markup, from_encoding)
        self.features = features
        self.from_encoding = from_encoding

        # Validate the requested parser against the registry up front so a
        # typo fails loudly instead of at first query.
        if features not in ParserRegistry.list_parsers():
            raise ValueError(
                f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
            )

        parser_class = ParserRegistry.get_parser(features)
        self.parser = parser_class

        # Parse that HTML! 🎯
        self._soup = self.parser.parse(self.markup)

        # BeautifulSoup-like attributes mirrored from the document root.
        self.name = self._soup.name if hasattr(self._soup, 'name') else None
        self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}

        # Small per-instance cache for user-stored parsed content.
        self._cache = {}

        # Text and web analyzers used by analyze_text / analyze_page_structure.
        self.text_analyzer = ScoutTextAnalyzer()
        self.web_analyzer = ScoutWebAnalyzer()

    def normalize_text(self, text: str, form='NFKD') -> str:
        """
        Normalize text using Unicode normalization.

        Args:
            text (str): Input text
            form (str, optional): Normalization form ('NFC', 'NFKC', 'NFD', 'NFKD')

        Returns:
            str: Normalized text
        """
        return unicodedata.normalize(form, text)

    def url_parse(self, url: str) -> Dict[str, str]:
        """
        Parse and analyze a URL.

        Args:
            url (str): URL to parse

        Returns:
            Dict[str, str]: Parsed URL components (scheme, netloc, path,
            params, query, fragment)
        """
        parsed = urllib.parse.urlparse(url)
        return {
            'scheme': parsed.scheme,
            'netloc': parsed.netloc,
            'path': parsed.path,
            'params': parsed.params,
            'query': parsed.query,
            'fragment': parsed.fragment
        }

    def analyze_page_structure(self) -> Dict[str, Any]:
        """
        Analyze the structure of the parsed page.

        Returns:
            Dict[str, Any]: Page structure analysis (delegated to ScoutWebAnalyzer)
        """
        return self.web_analyzer.analyze_page_structure(self)

    def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
        """
        Perform advanced text analysis.

        Args:
            text (str, optional): Text to analyze. If None, uses page text.

        Returns:
            Dict[str, Any]: Text analysis results with 'word_count',
            'entities' and 'tokens' keys.
        """
        if text is None:
            text = self.get_text()

        return {
            'word_count': self.text_analyzer.count_words(text),
            'entities': self.text_analyzer.extract_entities(text),
            'tokens': self.text_analyzer.tokenize(text)
        }

    def extract_semantic_info(self) -> Dict[str, Any]:
        """
        Extract semantic information from the document.

        Returns:
            Dict[str, Any]: Heading texts (h1-h3), list items per ul/ol,
            and table count/headers.
        """
        semantic_info = {
            'headings': {
                'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
                'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
                'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
            },
            'lists': {
                'ul': [ul.find_all('li') for ul in self.find_all('ul')],
                'ol': [ol.find_all('li') for ol in self.find_all('ol')]
            },
            'tables': {
                'count': len(self.find_all('table')),
                'headers': [table.find_all('th') for table in self.find_all('table')]
            }
        }
        return semantic_info

    def cache(self, key: str, value: Any = None) -> Any:
        """
        Manage a cache for parsed content.

        Note: a ``None`` value cannot be stored — passing ``value=None``
        performs a read-only lookup.

        Args:
            key (str): Cache key
            value (Any, optional): Value to cache

        Returns:
            Any: Cached value or None
        """
        if value is not None:
            self._cache[key] = value
        return self._cache.get(key)

    def hash_content(self, method='md5') -> str:
        """
        Generate a hash of the parsed content.

        Args:
            method (str, optional): Hashing method ('md5', 'sha1', 'sha256')

        Returns:
            str: Hex digest of the serialized document

        Raises:
            ValueError: If ``method`` is not supported.
        """
        hash_methods = {
            'md5': hashlib.md5,
            'sha1': hashlib.sha1,
            'sha256': hashlib.sha256
        }

        if method not in hash_methods:
            raise ValueError(f"Unsupported hash method: {method}")

        hasher = hash_methods[method]()
        hasher.update(str(self._soup).encode('utf-8'))
        return hasher.hexdigest()

    def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
        """
        Extract all links from the document.

        Args:
            base_url (str, optional): Base URL for resolving relative links

        Returns:
            List[Dict[str, str]]: One dict per <a>/<link> element with an
            'href', carrying 'href', 'text', 'rel' and 'type' keys.
        """
        links = []
        for link in self.find_all(['a', 'link']):
            href = link.get('href')
            if href:
                # Resolve relative URLs if base_url is provided.
                if base_url and not href.startswith(('http://', 'https://', '//')):
                    href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"

                # FIX: 'rel' may come back either as a list (multi-valued
                # attribute) or a plain string depending on the parser; the
                # old `link.get('rel', [None])[0]` returned the first
                # *character* for string values.
                rel = link.get('rel')
                if isinstance(rel, (list, tuple)):
                    rel = rel[0] if rel else None

                links.append({
                    'href': href,
                    'text': link.get_text(strip=True),
                    'rel': rel,
                    'type': link.get('type')
                })
        return links

    def extract_metadata(self) -> Dict[str, Any]:
        """
        Extract metadata from HTML document.

        Returns:
            Dict[str, Any]: 'title', 'description', 'keywords' plus
            'og_metadata' and 'twitter_metadata' dicts keyed by the
            property name with its 'og:'/'twitter:' prefix stripped.
        """
        # Run each query once instead of twice per field.
        titles = self.find('title').texts()
        descriptions = self.find('meta', attrs={'name': 'description'}).attrs('content')
        keywords = self.find('meta', attrs={'name': 'keywords'}).attrs('content')

        metadata = {
            'title': titles[0] if titles else None,
            'description': descriptions[0] if descriptions else None,
            # FIX: attrs('content') yields [None] when the attribute is
            # absent; the old code called .split(',') on None.
            'keywords': keywords[0].split(',') if keywords and keywords[0] else [],
            'og_metadata': {},
            'twitter_metadata': {}
        }

        # Open Graph metadata.
        # FIX: find_all() iteration yields Tag objects whose .attrs is a
        # dict (see __init__), so the old `meta.attrs('property')` call
        # raised TypeError; use Tag.get() instead.
        for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
            prop = meta.get('property')
            content = meta.get('content')
            if prop and content is not None:
                metadata['og_metadata'][prop[3:]] = content

        # Twitter Card metadata (same fix as above).
        for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
            name_attr = meta.get('name')
            content = meta.get('content')
            if name_attr and content is not None:
                metadata['twitter_metadata'][name_attr[8:]] = content

        return metadata

    def to_json(self, indent=2) -> str:
        """
        Convert parsed content to JSON.

        Args:
            indent (int, optional): JSON indentation

        Returns:
            str: JSON representation of the document tree (name, attrs,
            text, children per tag; bare strings for text nodes)
        """
        def _tag_to_dict(tag):
            # Text nodes serialize as plain strings.
            if isinstance(tag, NavigableString):
                return str(tag)

            result = {
                'name': tag.name,
                'attrs': tag.attrs,
                'text': tag.get_text(strip=True)
            }

            if tag.contents:
                result['children'] = [_tag_to_dict(child) for child in tag.contents]

            return result

        return json.dumps(_tag_to_dict(self._soup), indent=indent)

    def find(self, name=None, attrs=None, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
        """
        Find the first matching element.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str, optional): Text content to match

        Returns:
            ScoutSearchResult: Result set holding at most one element
            (empty when nothing matched).
        """
        # Avoid the shared mutable-default-dict pitfall.
        result = self._soup.find(name, attrs if attrs is not None else {}, recursive, text, **kwargs)
        return ScoutSearchResult([result]) if result else ScoutSearchResult([])

    def find_all(self, name=None, attrs=None, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
        """
        Find all matching elements.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str, optional): Text content to match
            limit (int, optional): Maximum number of results

        Returns:
            ScoutSearchResult: Result set of matching elements
        """
        results = self._soup.find_all(name, attrs if attrs is not None else {}, recursive, text, limit, **kwargs)
        return ScoutSearchResult(results)

    def find_parent(self, name=None, attrs=None, **kwargs) -> Optional[Tag]:
        """
        Find the first parent matching given criteria.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match (exact-equality)

        Returns:
            Tag or None: First matching parent
        """
        attrs = attrs if attrs is not None else {}
        current = self._soup.parent
        while current:
            if (name is None or current.name == name) and \
                    all(current.get(k) == v for k, v in attrs.items()):
                return current
            current = current.parent
        return None

    def find_parents(self, name=None, attrs=None, limit=None, **kwargs) -> List[Tag]:
        """
        Find all parents matching given criteria.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match (exact-equality)
            limit (int, optional): Maximum number of results

        Returns:
            List[Tag]: Matching parents, nearest first
        """
        attrs = attrs if attrs is not None else {}
        parents = []
        current = self._soup.parent
        while current and (limit is None or len(parents) < limit):
            if (name is None or current.name == name) and \
                    all(current.get(k) == v for k, v in attrs.items()):
                parents.append(current)
            current = current.parent
        return parents

    def find_next_sibling(self, name=None, attrs=None, **kwargs) -> Optional[Tag]:
        """
        Find the next sibling matching given criteria.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match (exact-equality)

        Returns:
            Tag or None: First matching next sibling
        """
        attrs = attrs if attrs is not None else {}
        if not self._soup.parent:
            return None

        siblings = self._soup.parent.contents
        try:
            current_index = siblings.index(self._soup)
            for sibling in siblings[current_index + 1:]:
                if isinstance(sibling, Tag):
                    if (name is None or sibling.name == name) and \
                            all(sibling.get(k) == v for k, v in attrs.items()):
                        return sibling
        except ValueError:
            # The root node is not among its parent's contents; no siblings.
            pass
        return None

    def find_next_siblings(self, name=None, attrs=None, limit=None, **kwargs) -> List[Tag]:
        """
        Find all next siblings matching given criteria.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match (exact-equality)
            limit (int, optional): Maximum number of results

        Returns:
            List[Tag]: Matching next siblings, in document order
        """
        attrs = attrs if attrs is not None else {}
        if not self._soup.parent:
            return []

        siblings = []
        siblings_list = self._soup.parent.contents
        try:
            current_index = siblings_list.index(self._soup)
            for sibling in siblings_list[current_index + 1:]:
                if isinstance(sibling, Tag):
                    if (name is None or sibling.name == name) and \
                            all(sibling.get(k) == v for k, v in attrs.items()):
                        siblings.append(sibling)
                        if limit and len(siblings) == limit:
                            break
        except ValueError:
            pass
        return siblings

    def find_previous_sibling(self, name=None, attrs=None, **kwargs) -> Optional[Tag]:
        """Find the previous sibling matching given criteria (nearest first)."""
        attrs = attrs if attrs is not None else {}
        if not self._soup.parent:
            return None

        siblings = self._soup.parent.contents
        try:
            current_index = siblings.index(self._soup)
            for sibling in reversed(siblings[:current_index]):
                if isinstance(sibling, Tag):
                    if (name is None or sibling.name == name) and all(
                        sibling.get(k) == v for k, v in attrs.items()
                    ):
                        return sibling
        except ValueError:
            pass
        return None

    def find_previous_siblings(self, name=None, attrs=None, limit=None, **kwargs) -> List[Tag]:
        """Find all previous siblings matching given criteria, nearest first."""
        attrs = attrs if attrs is not None else {}
        if not self._soup.parent:
            return []

        siblings = []
        siblings_list = self._soup.parent.contents
        try:
            current_index = siblings_list.index(self._soup)
            for sibling in reversed(siblings_list[:current_index]):
                if isinstance(sibling, Tag):
                    if (name is None or sibling.name == name) and all(
                        sibling.get(k) == v for k, v in attrs.items()
                    ):
                        siblings.append(sibling)
                        if limit and len(siblings) == limit:
                            break
        except ValueError:
            pass
        return siblings

    def select(self, selector: str) -> List[Tag]:
        """
        Select elements using CSS selector.

        Args:
            selector (str): CSS selector string

        Returns:
            List[Tag]: List of matching elements
        """
        return self._soup.select(selector)

    def select_one(self, selector: str) -> Optional[Tag]:
        """
        Select the first element matching the CSS selector.

        Args:
            selector (str): CSS selector string

        Returns:
            Tag or None: First matching element
        """
        return self._soup.select_one(selector)

    def get_text(self, separator=' ', strip=False, types=None) -> str:
        """
        Extract all text from the parsed document.

        The raw text is re-segmented into sentences and joined with blank
        lines, so the output is paragraph-like rather than raw node text.

        Args:
            separator (str, optional): Text separator
            strip (bool, optional): Strip whitespace
            types (list, optional): Types of content to extract

        Returns:
            str: Sentence-tokenized text joined by blank lines
        """
        tokenizer = SentenceTokenizer()
        text = self._soup.get_text(separator, strip, types)
        sentences = tokenizer.tokenize(text)
        return "\n\n".join(sentences)

    def remove_tags(self, tags: List[str]) -> None:
        """
        Remove specified tags and their contents from the document.

        Args:
            tags (List[str]): List of tag names to remove
        """
        for tag_name in tags:
            for tag in self._soup.find_all(tag_name):
                tag.decompose()

    def prettify(self, formatter='minimal') -> str:
        """
        Return a formatted, pretty-printed version of the HTML.

        Args:
            formatter (str, optional): Formatting style

        Returns:
            str: Prettified HTML
        """
        return self._soup.prettify(formatter)

    def decompose(self, tag: Tag = None) -> None:
        """
        Remove a tag and its contents from the document.

        Args:
            tag (Tag, optional): Tag to remove. If None, removes the root tag.
        """
        if tag is None:
            tag = self._soup
        tag.decompose()

    def extract(self, tag: Tag = None) -> Tag:
        """
        Remove a tag from the document and return it.

        Args:
            tag (Tag, optional): Tag to extract. If None, extracts the root tag.

        Returns:
            Tag: Extracted tag
        """
        if tag is None:
            tag = self._soup
        return tag.extract()

    def clear(self, tag: Tag = None) -> None:
        """
        Remove a tag's contents while keeping the tag itself.

        Args:
            tag (Tag, optional): Tag to clear. If None, clears the root tag.
        """
        if tag is None:
            tag = self._soup
        tag.clear()

    def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
        """
        Replace one tag with another.

        Args:
            old_tag (Tag): Tag to replace
            new_tag (Tag): Replacement tag
        """
        old_tag.replace_with(new_tag)

    def encode(self, encoding='utf-8') -> bytes:
        """
        Encode the document to a specific encoding.

        Args:
            encoding (str, optional): Encoding to use

        Returns:
            bytes: Encoded document
        """
        return str(self._soup).encode(encoding)

    def decode(self, encoding='utf-8') -> str:
        """
        Return the document as a string.

        Note: the ``encoding`` argument is accepted for API symmetry with
        :meth:`encode` but is not used — the document is already text.

        Args:
            encoding (str, optional): Ignored

        Returns:
            str: Document as a string
        """
        return str(self._soup)

    def __str__(self) -> str:
        """
        String representation of the parsed document.

        Returns:
            str: HTML content
        """
        return str(self._soup)

    def __repr__(self) -> str:
        """
        Detailed representation of the Scout object.

        Returns:
            str: Scout object description
        """
        return f"Scout(features='{self.features}', content_length={len(self.markup)})"

    def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
        """
        Preprocess markup before parsing.

        Args:
            markup (str): Input markup
            encoding (str, optional): Encoding to use

        Returns:
            str: Preprocessed markup (comments stripped, whitespace collapsed)
        """
        # Decode markup via shared helper.
        decoded_markup = decode_markup(markup, encoding)

        # Basic HTML cleaning: drop comments, then collapse whitespace runs.
        # NOTE(review): collapsing whitespace also affects <pre>/<textarea>
        # content — preserved as existing behavior; confirm before changing.
        decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
        decoded_markup = re.sub(r'\s+', ' ', decoded_markup)

        return decoded_markup
@@ -0,0 +1,96 @@
1
+ """
2
+ Scout Search Result Module
3
+ """
4
+
5
+ from typing import List, Union, Callable, Any, Dict, Iterator
6
+ from ..element import Tag
7
+ from .text_analyzer import ScoutTextAnalyzer
8
+
9
+
10
class ScoutSearchResult:
    """
    A sequence-like wrapper around a list of matched tags.

    Adds bulk text/attribute extraction plus functional-style filtering,
    mapping, and lightweight text analysis on top of plain list access.
    """

    def __init__(self, results: List[Tag]):
        """
        Wrap the given list of matched tags.

        Args:
            results (List[Tag]): Tags matched by a search, in document order
        """
        self._results = results

    def __len__(self) -> int:
        # Number of matched tags.
        return len(self._results)

    def __iter__(self) -> Iterator[Tag]:
        # Yield matched tags in document order.
        return iter(self._results)

    def __getitem__(self, index: Union[int, slice]) -> Union[Tag, List[Tag]]:
        # Plain list indexing/slicing over the matches.
        return self._results[index]

    def texts(self, separator=' ', strip=True) -> List[str]:
        """
        Collect the text content of every match.

        Args:
            separator (str, optional): Text separator passed to get_text
            strip (bool, optional): Strip whitespace

        Returns:
            List[str]: One extracted string per matched tag
        """
        collected = []
        for element in self._results:
            collected.append(element.get_text(separator, strip))
        return collected

    def attrs(self, attr_name: str) -> List[Any]:
        """
        Collect one attribute's value from every match.

        Args:
            attr_name (str): Attribute name to extract

        Returns:
            List[Any]: Attribute values (None where the tag lacks it)
        """
        collected = []
        for element in self._results:
            collected.append(element.get(attr_name))
        return collected

    def filter(self, predicate: Callable[[Tag], bool]) -> 'ScoutSearchResult':
        """
        Keep only the matches for which the predicate is truthy.

        Args:
            predicate (Callable[[Tag], bool]): Filtering function

        Returns:
            ScoutSearchResult: A new result set with the kept tags
        """
        kept = [element for element in self._results if predicate(element)]
        return ScoutSearchResult(kept)

    def map(self, transform: Callable[[Tag], Any]) -> List[Any]:
        """
        Apply a transformation to every match.

        Args:
            transform (Callable[[Tag], Any]): Transformation function

        Returns:
            List[Any]: One transformed value per matched tag
        """
        return [transform(element) for element in self._results]

    def analyze_text(self) -> Dict[str, Any]:
        """
        Run word-count and entity analysis over the combined match text.

        Returns:
            Dict[str, Any]: 'total_results', 'word_count' and 'entities'
        """
        combined = ' '.join(self.texts(strip=True))
        return {
            'total_results': len(self._results),
            'word_count': ScoutTextAnalyzer.count_words(combined),
            'entities': ScoutTextAnalyzer.extract_entities(combined),
        }