webscout-8.2.2-py3-none-any.whl → webscout-8.2.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (306)
  1. webscout/AIauto.py +112 -22
  2. webscout/AIbase.py +144 -7
  3. webscout/AIutel.py +249 -131
  4. webscout/Bard.py +579 -206
  5. webscout/DWEBS.py +78 -35
  6. webscout/__init__.py +0 -1
  7. webscout/cli.py +256 -0
  8. webscout/conversation.py +307 -436
  9. webscout/exceptions.py +23 -0
  10. webscout/prompt_manager.py +56 -42
  11. webscout/version.py +1 -1
  12. webscout/webscout_search.py +65 -47
  13. webscout/webscout_search_async.py +81 -126
  14. webscout/yep_search.py +93 -43
  15. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info}/METADATA +172 -52
  16. webscout-8.2.7.dist-info/RECORD +26 -0
  17. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info}/WHEEL +1 -1
  18. webscout-8.2.7.dist-info/entry_points.txt +3 -0
  19. webscout-8.2.7.dist-info/top_level.txt +1 -0
  20. inferno/__init__.py +0 -6
  21. inferno/__main__.py +0 -9
  22. inferno/cli.py +0 -6
  23. webscout/Extra/GitToolkit/__init__.py +0 -10
  24. webscout/Extra/GitToolkit/gitapi/__init__.py +0 -12
  25. webscout/Extra/GitToolkit/gitapi/repository.py +0 -195
  26. webscout/Extra/GitToolkit/gitapi/user.py +0 -96
  27. webscout/Extra/GitToolkit/gitapi/utils.py +0 -62
  28. webscout/Extra/YTToolkit/YTdownloader.py +0 -957
  29. webscout/Extra/YTToolkit/__init__.py +0 -3
  30. webscout/Extra/YTToolkit/transcriber.py +0 -476
  31. webscout/Extra/YTToolkit/ytapi/__init__.py +0 -6
  32. webscout/Extra/YTToolkit/ytapi/channel.py +0 -307
  33. webscout/Extra/YTToolkit/ytapi/errors.py +0 -13
  34. webscout/Extra/YTToolkit/ytapi/extras.py +0 -45
  35. webscout/Extra/YTToolkit/ytapi/https.py +0 -88
  36. webscout/Extra/YTToolkit/ytapi/patterns.py +0 -61
  37. webscout/Extra/YTToolkit/ytapi/playlist.py +0 -59
  38. webscout/Extra/YTToolkit/ytapi/pool.py +0 -8
  39. webscout/Extra/YTToolkit/ytapi/query.py +0 -40
  40. webscout/Extra/YTToolkit/ytapi/stream.py +0 -63
  41. webscout/Extra/YTToolkit/ytapi/utils.py +0 -62
  42. webscout/Extra/YTToolkit/ytapi/video.py +0 -232
  43. webscout/Extra/__init__.py +0 -7
  44. webscout/Extra/autocoder/__init__.py +0 -9
  45. webscout/Extra/autocoder/autocoder.py +0 -849
  46. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  47. webscout/Extra/gguf.py +0 -682
  48. webscout/Extra/tempmail/__init__.py +0 -28
  49. webscout/Extra/tempmail/async_utils.py +0 -141
  50. webscout/Extra/tempmail/base.py +0 -161
  51. webscout/Extra/tempmail/cli.py +0 -187
  52. webscout/Extra/tempmail/emailnator.py +0 -84
  53. webscout/Extra/tempmail/mail_tm.py +0 -361
  54. webscout/Extra/tempmail/temp_mail_io.py +0 -292
  55. webscout/Extra/weather.py +0 -194
  56. webscout/Extra/weather_ascii.py +0 -76
  57. webscout/LLM.py +0 -442
  58. webscout/Litlogger/__init__.py +0 -67
  59. webscout/Litlogger/core/__init__.py +0 -6
  60. webscout/Litlogger/core/level.py +0 -23
  61. webscout/Litlogger/core/logger.py +0 -165
  62. webscout/Litlogger/handlers/__init__.py +0 -12
  63. webscout/Litlogger/handlers/console.py +0 -33
  64. webscout/Litlogger/handlers/file.py +0 -143
  65. webscout/Litlogger/handlers/network.py +0 -173
  66. webscout/Litlogger/styles/__init__.py +0 -7
  67. webscout/Litlogger/styles/colors.py +0 -249
  68. webscout/Litlogger/styles/formats.py +0 -458
  69. webscout/Litlogger/styles/text.py +0 -87
  70. webscout/Litlogger/utils/__init__.py +0 -6
  71. webscout/Litlogger/utils/detectors.py +0 -153
  72. webscout/Litlogger/utils/formatters.py +0 -200
  73. webscout/Local/__init__.py +0 -12
  74. webscout/Local/__main__.py +0 -9
  75. webscout/Local/api.py +0 -576
  76. webscout/Local/cli.py +0 -516
  77. webscout/Local/config.py +0 -75
  78. webscout/Local/llm.py +0 -287
  79. webscout/Local/model_manager.py +0 -253
  80. webscout/Local/server.py +0 -721
  81. webscout/Local/utils.py +0 -93
  82. webscout/Provider/AI21.py +0 -177
  83. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  84. webscout/Provider/AISEARCH/ISou.py +0 -256
  85. webscout/Provider/AISEARCH/Perplexity.py +0 -359
  86. webscout/Provider/AISEARCH/__init__.py +0 -10
  87. webscout/Provider/AISEARCH/felo_search.py +0 -228
  88. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  89. webscout/Provider/AISEARCH/hika_search.py +0 -194
  90. webscout/Provider/AISEARCH/iask_search.py +0 -436
  91. webscout/Provider/AISEARCH/monica_search.py +0 -246
  92. webscout/Provider/AISEARCH/scira_search.py +0 -324
  93. webscout/Provider/AISEARCH/webpilotai_search.py +0 -281
  94. webscout/Provider/Aitopia.py +0 -292
  95. webscout/Provider/AllenAI.py +0 -413
  96. webscout/Provider/Andi.py +0 -228
  97. webscout/Provider/Blackboxai.py +0 -229
  98. webscout/Provider/C4ai.py +0 -432
  99. webscout/Provider/ChatGPTClone.py +0 -226
  100. webscout/Provider/ChatGPTES.py +0 -237
  101. webscout/Provider/ChatGPTGratis.py +0 -194
  102. webscout/Provider/Chatify.py +0 -175
  103. webscout/Provider/Cloudflare.py +0 -273
  104. webscout/Provider/Cohere.py +0 -208
  105. webscout/Provider/DeepSeek.py +0 -196
  106. webscout/Provider/Deepinfra.py +0 -297
  107. webscout/Provider/ElectronHub.py +0 -709
  108. webscout/Provider/ExaAI.py +0 -261
  109. webscout/Provider/ExaChat.py +0 -342
  110. webscout/Provider/Free2GPT.py +0 -241
  111. webscout/Provider/GPTWeb.py +0 -193
  112. webscout/Provider/Gemini.py +0 -169
  113. webscout/Provider/GithubChat.py +0 -367
  114. webscout/Provider/Glider.py +0 -211
  115. webscout/Provider/Groq.py +0 -670
  116. webscout/Provider/HF_space/__init__.py +0 -0
  117. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  118. webscout/Provider/HeckAI.py +0 -233
  119. webscout/Provider/HuggingFaceChat.py +0 -462
  120. webscout/Provider/Hunyuan.py +0 -272
  121. webscout/Provider/Jadve.py +0 -266
  122. webscout/Provider/Koboldai.py +0 -381
  123. webscout/Provider/LambdaChat.py +0 -392
  124. webscout/Provider/Llama.py +0 -200
  125. webscout/Provider/Llama3.py +0 -204
  126. webscout/Provider/Marcus.py +0 -148
  127. webscout/Provider/Netwrck.py +0 -228
  128. webscout/Provider/OLLAMA.py +0 -396
  129. webscout/Provider/OPENAI/__init__.py +0 -25
  130. webscout/Provider/OPENAI/base.py +0 -46
  131. webscout/Provider/OPENAI/c4ai.py +0 -367
  132. webscout/Provider/OPENAI/chatgpt.py +0 -549
  133. webscout/Provider/OPENAI/chatgptclone.py +0 -460
  134. webscout/Provider/OPENAI/deepinfra.py +0 -272
  135. webscout/Provider/OPENAI/e2b.py +0 -1350
  136. webscout/Provider/OPENAI/exaai.py +0 -404
  137. webscout/Provider/OPENAI/exachat.py +0 -433
  138. webscout/Provider/OPENAI/freeaichat.py +0 -352
  139. webscout/Provider/OPENAI/glider.py +0 -316
  140. webscout/Provider/OPENAI/heckai.py +0 -337
  141. webscout/Provider/OPENAI/llmchatco.py +0 -327
  142. webscout/Provider/OPENAI/netwrck.py +0 -348
  143. webscout/Provider/OPENAI/opkfc.py +0 -488
  144. webscout/Provider/OPENAI/scirachat.py +0 -463
  145. webscout/Provider/OPENAI/sonus.py +0 -294
  146. webscout/Provider/OPENAI/standardinput.py +0 -425
  147. webscout/Provider/OPENAI/textpollinations.py +0 -285
  148. webscout/Provider/OPENAI/toolbaz.py +0 -405
  149. webscout/Provider/OPENAI/typegpt.py +0 -346
  150. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  151. webscout/Provider/OPENAI/utils.py +0 -211
  152. webscout/Provider/OPENAI/venice.py +0 -413
  153. webscout/Provider/OPENAI/wisecat.py +0 -381
  154. webscout/Provider/OPENAI/writecream.py +0 -156
  155. webscout/Provider/OPENAI/x0gpt.py +0 -371
  156. webscout/Provider/OPENAI/yep.py +0 -327
  157. webscout/Provider/OpenGPT.py +0 -199
  158. webscout/Provider/Openai.py +0 -496
  159. webscout/Provider/PI.py +0 -344
  160. webscout/Provider/Perplexitylabs.py +0 -415
  161. webscout/Provider/Phind.py +0 -535
  162. webscout/Provider/PizzaGPT.py +0 -198
  163. webscout/Provider/QwenLM.py +0 -254
  164. webscout/Provider/Reka.py +0 -214
  165. webscout/Provider/StandardInput.py +0 -278
  166. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  167. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  168. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  169. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  170. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  171. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  172. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  173. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  174. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  175. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  176. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  177. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  178. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  179. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  180. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  181. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  182. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  183. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  184. webscout/Provider/TTI/__init__.py +0 -12
  185. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  186. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  187. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  188. webscout/Provider/TTI/artbit/__init__.py +0 -22
  189. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  190. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  191. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  192. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  193. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  194. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  195. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  196. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  197. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  198. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  199. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  200. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  201. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  202. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  203. webscout/Provider/TTI/talkai/__init__.py +0 -4
  204. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  205. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  206. webscout/Provider/TTS/__init__.py +0 -7
  207. webscout/Provider/TTS/deepgram.py +0 -156
  208. webscout/Provider/TTS/elevenlabs.py +0 -111
  209. webscout/Provider/TTS/gesserit.py +0 -127
  210. webscout/Provider/TTS/murfai.py +0 -113
  211. webscout/Provider/TTS/parler.py +0 -111
  212. webscout/Provider/TTS/speechma.py +0 -180
  213. webscout/Provider/TTS/streamElements.py +0 -333
  214. webscout/Provider/TTS/utils.py +0 -280
  215. webscout/Provider/TeachAnything.py +0 -187
  216. webscout/Provider/TextPollinationsAI.py +0 -231
  217. webscout/Provider/TwoAI.py +0 -199
  218. webscout/Provider/Venice.py +0 -219
  219. webscout/Provider/VercelAI.py +0 -234
  220. webscout/Provider/WebSim.py +0 -228
  221. webscout/Provider/WiseCat.py +0 -196
  222. webscout/Provider/Writecream.py +0 -211
  223. webscout/Provider/WritingMate.py +0 -197
  224. webscout/Provider/Youchat.py +0 -330
  225. webscout/Provider/__init__.py +0 -198
  226. webscout/Provider/ai4chat.py +0 -202
  227. webscout/Provider/aimathgpt.py +0 -189
  228. webscout/Provider/akashgpt.py +0 -342
  229. webscout/Provider/askmyai.py +0 -158
  230. webscout/Provider/asksteve.py +0 -203
  231. webscout/Provider/bagoodex.py +0 -145
  232. webscout/Provider/cerebras.py +0 -242
  233. webscout/Provider/chatglm.py +0 -205
  234. webscout/Provider/cleeai.py +0 -213
  235. webscout/Provider/copilot.py +0 -428
  236. webscout/Provider/elmo.py +0 -234
  237. webscout/Provider/freeaichat.py +0 -271
  238. webscout/Provider/gaurish.py +0 -244
  239. webscout/Provider/geminiapi.py +0 -208
  240. webscout/Provider/geminiprorealtime.py +0 -160
  241. webscout/Provider/granite.py +0 -187
  242. webscout/Provider/hermes.py +0 -219
  243. webscout/Provider/julius.py +0 -223
  244. webscout/Provider/koala.py +0 -268
  245. webscout/Provider/labyrinth.py +0 -340
  246. webscout/Provider/learnfastai.py +0 -266
  247. webscout/Provider/lepton.py +0 -194
  248. webscout/Provider/llama3mitril.py +0 -180
  249. webscout/Provider/llamatutor.py +0 -192
  250. webscout/Provider/llmchat.py +0 -213
  251. webscout/Provider/llmchatco.py +0 -311
  252. webscout/Provider/meta.py +0 -794
  253. webscout/Provider/multichat.py +0 -325
  254. webscout/Provider/promptrefine.py +0 -193
  255. webscout/Provider/scira_chat.py +0 -277
  256. webscout/Provider/scnet.py +0 -187
  257. webscout/Provider/searchchat.py +0 -293
  258. webscout/Provider/sonus.py +0 -208
  259. webscout/Provider/talkai.py +0 -194
  260. webscout/Provider/toolbaz.py +0 -320
  261. webscout/Provider/turboseek.py +0 -219
  262. webscout/Provider/tutorai.py +0 -252
  263. webscout/Provider/typefully.py +0 -280
  264. webscout/Provider/typegpt.py +0 -232
  265. webscout/Provider/uncovr.py +0 -312
  266. webscout/Provider/x0gpt.py +0 -256
  267. webscout/Provider/yep.py +0 -376
  268. webscout/litagent/__init__.py +0 -29
  269. webscout/litagent/agent.py +0 -455
  270. webscout/litagent/constants.py +0 -60
  271. webscout/litprinter/__init__.py +0 -59
  272. webscout/scout/__init__.py +0 -8
  273. webscout/scout/core/__init__.py +0 -7
  274. webscout/scout/core/crawler.py +0 -140
  275. webscout/scout/core/scout.py +0 -568
  276. webscout/scout/core/search_result.py +0 -96
  277. webscout/scout/core/text_analyzer.py +0 -63
  278. webscout/scout/core/text_utils.py +0 -277
  279. webscout/scout/core/web_analyzer.py +0 -52
  280. webscout/scout/core.py +0 -881
  281. webscout/scout/element.py +0 -460
  282. webscout/scout/parsers/__init__.py +0 -69
  283. webscout/scout/parsers/html5lib_parser.py +0 -172
  284. webscout/scout/parsers/html_parser.py +0 -236
  285. webscout/scout/parsers/lxml_parser.py +0 -178
  286. webscout/scout/utils.py +0 -37
  287. webscout/swiftcli/__init__.py +0 -809
  288. webscout/zeroart/__init__.py +0 -55
  289. webscout/zeroart/base.py +0 -60
  290. webscout/zeroart/effects.py +0 -99
  291. webscout/zeroart/fonts.py +0 -816
  292. webscout-8.2.2.dist-info/RECORD +0 -309
  293. webscout-8.2.2.dist-info/entry_points.txt +0 -5
  294. webscout-8.2.2.dist-info/top_level.txt +0 -3
  295. webstoken/__init__.py +0 -30
  296. webstoken/classifier.py +0 -189
  297. webstoken/keywords.py +0 -216
  298. webstoken/language.py +0 -128
  299. webstoken/ner.py +0 -164
  300. webstoken/normalizer.py +0 -35
  301. webstoken/processor.py +0 -77
  302. webstoken/sentiment.py +0 -206
  303. webstoken/stemmer.py +0 -73
  304. webstoken/tagger.py +0 -60
  305. webstoken/tokenizer.py +0 -158
  306. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,140 +0,0 @@
-"""
-Scout Crawler Module
-"""
-
-import concurrent.futures
-import urllib.parse
-from typing import Union, List, Dict
-import requests
-
-from .scout import Scout
-
-class ScoutCrawler:
-    """
-    Advanced web crawling utility for Scout library.
-    """
-    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
-        """
-        Initialize the web crawler.
-
-        Args:
-            base_url (str): Starting URL to crawl
-            max_pages (int, optional): Maximum number of pages to crawl
-            tags_to_remove (List[str], optional): List of tags to remove
-        """
-        self.base_url = base_url
-        self.max_pages = max_pages
-        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else ["script", "style", "header", "footer", "nav", "aside", "form", "button"]
-        self.visited_urls = set()
-        self.crawled_pages = []
-
-    def _is_valid_url(self, url: str) -> bool:
-        """
-        Check if a URL is valid and within the same domain.
-
-        Args:
-            url (str): URL to validate
-
-        Returns:
-            bool: Whether the URL is valid
-        """
-        try:
-            parsed_base = urllib.parse.urlparse(self.base_url)
-            parsed_url = urllib.parse.urlparse(url)
-
-            return (
-                parsed_url.scheme in ['http', 'https'] and
-                parsed_base.netloc == parsed_url.netloc and
-                len(self.visited_urls) < self.max_pages
-            )
-        except Exception:
-            return False
-
-    def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
-        """
-        Crawl a single page and extract information.
-
-        Args:
-            url (str): URL to crawl
-            depth (int, optional): Current crawl depth
-
-        Returns:
-            Dict[str, Union[str, List[str]]]: Crawled page information
-        """
-        if url in self.visited_urls:
-            return {}
-
-        try:
-            response = requests.get(url, timeout=10)
-            response.raise_for_status()
-
-            scout = Scout(response.content, features='lxml')
-
-            title_result = scout.find('title')
-            title = title_result[0].get_text() if title_result else ''
-
-            visible_text = scout._soup.get_text(strip=True)
-
-            for tag in scout._soup(self.tags_to_remove):
-                tag.extract()
-
-            page_info = {
-                'url': url,
-                'title': title,
-                'links': [
-                    urllib.parse.urljoin(url, link.get('href'))
-                    for link in scout.find_all('a', href=True)
-                    if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
-                ],
-                'text': visible_text,
-                'depth': depth
-            }
-
-            self.visited_urls.add(url)
-            self.crawled_pages.append(page_info)
-
-            return page_info
-        except Exception as e:
-            print(f"Error crawling {url}: {e}")
-            return {}
-
-    def crawl(self) -> List[Dict[str, Union[str, List[str]]]]:
-        """
-        Start web crawling from base URL.
-
-        Returns:
-            List[Dict[str, Union[str, List[str]]]]: List of crawled pages
-        """
-        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-            futures = {executor.submit(self._crawl_page, self.base_url, 0)}
-
-            while futures:
-                done, futures = concurrent.futures.wait(
-                    futures, return_when=concurrent.futures.FIRST_COMPLETED
-                )
-
-                for future in done:
-                    page_info = future.result()
-
-                    if len(self.visited_urls) >= self.max_pages:
-                        break
-
-                    submitted_links = set()  # New set to track submitted links
-                    for link in page_info.get('links', []):
-                        if (
-                            len(self.visited_urls) < self.max_pages and
-                            link not in self.visited_urls
-                        ):
-                            if link not in submitted_links:  # Check against submitted links
-                                submitted_links.add(link)  # Add to submitted links
-                                futures.add(
-                                    executor.submit(
-                                        self._crawl_page,
-                                        link,
-                                        page_info.get('depth', 0) + 1
-                                    )
-                                )
-                                if len(self.visited_urls) >= self.max_pages:
-                                    break
-
-        return self.crawled_pages
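
For reference, here is a minimal sketch of how the removed ScoutCrawler was driven, reconstructed from the deleted source above. The import path reflects the 8.2.2 layout and is an assumption; none of this exists in 8.2.7. Note a quirk visible in the diff: _crawl_page captures the page text before stripping tags_to_remove, so script and style content still leaks into the 'text' field.

# Hypothetical driver for the removed crawler; the import path is assumed
# from the 8.2.2 package layout and does not exist in 8.2.7.
from webscout.scout.core.crawler import ScoutCrawler

# Crawl up to 10 pages on the same domain, starting from the base URL.
crawler = ScoutCrawler("https://example.com", max_pages=10)
pages = crawler.crawl()

# Each result dict carries 'url', 'title', 'links', 'text', and 'depth'.
for page in pages:
    print(page['depth'], page['url'], page['title'])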
@@ -1,568 +0,0 @@
-"""
-Scout Main Module - HTML Parsing and Traversal
-"""
-import re
-import json
-import hashlib
-import unicodedata
-import urllib.parse
-from typing import List, Dict, Optional, Any
-
-from ..parsers import ParserRegistry
-from ..element import Tag, NavigableString
-from ..utils import decode_markup
-from .text_analyzer import ScoutTextAnalyzer
-from .web_analyzer import ScoutWebAnalyzer
-from .search_result import ScoutSearchResult
-from .text_utils import SentenceTokenizer
-
-
-class Scout:
-    """
-    Scout - Making web scraping a breeze! 🌊
-    A comprehensive HTML parsing and traversal library.
-    Enhanced with advanced features and intelligent parsing.
-    """
-
-    def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
-        """
-        Initialize Scout with HTML content.
-
-        Args:
-            markup (str): HTML content to parse
-            features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
-            from_encoding (str): Source encoding (if known)
-            **kwargs: Additional parsing options
-        """
-        # Intelligent markup handling
-        self.markup = self._preprocess_markup(markup, from_encoding)
-        self.features = features
-        self.from_encoding = from_encoding
-
-        # Get the right parser for the job
-        if features not in ParserRegistry.list_parsers():
-            raise ValueError(
-                f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
-            )
-
-        parser_class = ParserRegistry.get_parser(features)
-        self.parser = parser_class
-
-        # Parse that HTML! 🎯
-        self._soup = self.parser.parse(self.markup)
-
-        # BeautifulSoup-like attributes
-        self.name = self._soup.name if hasattr(self._soup, 'name') else None
-        self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
-
-        # Advanced parsing options
-        self._cache = {}
-
-        # Text and web analyzers
-        self.text_analyzer = ScoutTextAnalyzer()
-        self.web_analyzer = ScoutWebAnalyzer()
-
-    def normalize_text(self, text: str, form='NFKD') -> str:
-        """
-        Normalize text using Unicode normalization.
-
-        Args:
-            text (str): Input text
-            form (str, optional): Normalization form
-
-        Returns:
-            str: Normalized text
-        """
-        return unicodedata.normalize(form, text)
-
-    def url_parse(self, url: str) -> Dict[str, str]:
-        """
-        Parse and analyze a URL.
-
-        Args:
-            url (str): URL to parse
-
-        Returns:
-            Dict[str, str]: Parsed URL components
-        """
-        parsed = urllib.parse.urlparse(url)
-        return {
-            'scheme': parsed.scheme,
-            'netloc': parsed.netloc,
-            'path': parsed.path,
-            'params': parsed.params,
-            'query': parsed.query,
-            'fragment': parsed.fragment
-        }
-
-    def analyze_page_structure(self) -> Dict[str, Any]:
-        """
-        Analyze the structure of the parsed page.
-
-        Returns:
-            Dict[str, Any]: Page structure analysis
-        """
-        return self.web_analyzer.analyze_page_structure(self)
-
-    def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
-        """
-        Perform advanced text analysis.
-
-        Args:
-            text (str, optional): Text to analyze. If None, uses page text.
-
-        Returns:
-            Dict[str, Any]: Text analysis results
-        """
-        if text is None:
-            text = self.get_text()
-
-        return {
-            'word_count': self.text_analyzer.count_words(text),
-            'entities': self.text_analyzer.extract_entities(text),
-            'tokens': self.text_analyzer.tokenize(text)
-        }
-
-    def extract_semantic_info(self) -> Dict[str, Any]:
-        """
-        Extract semantic information from the document.
-
-        Returns:
-            Dict[str, Any]: Semantic information
-        """
-        semantic_info = {
-            'headings': {
-                'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
-                'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
-                'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
-            },
-            'lists': {
-                'ul': [ul.find_all('li') for ul in self.find_all('ul')],
-                'ol': [ol.find_all('li') for ol in self.find_all('ol')]
-            },
-            'tables': {
-                'count': len(self.find_all('table')),
-                'headers': [table.find_all('th') for table in self.find_all('table')]
-            }
-        }
-        return semantic_info
-
-    def cache(self, key: str, value: Any = None) -> Any:
-        """
-        Manage a cache for parsed content.
-
-        Args:
-            key (str): Cache key
-            value (Any, optional): Value to cache
-
-        Returns:
-            Any: Cached value or None
-        """
-        if value is not None:
-            self._cache[key] = value
-        return self._cache.get(key)
-
-    def hash_content(self, method='md5') -> str:
-        """
-        Generate a hash of the parsed content.
-
-        Args:
-            method (str, optional): Hashing method
-
-        Returns:
-            str: Content hash
-        """
-        hash_methods = {
-            'md5': hashlib.md5,
-            'sha1': hashlib.sha1,
-            'sha256': hashlib.sha256
-        }
-
-        if method not in hash_methods:
-            raise ValueError(f"Unsupported hash method: {method}")
-
-        hasher = hash_methods[method]()
-        hasher.update(str(self._soup).encode('utf-8'))
-        return hasher.hexdigest()
-
-    def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
-        """
-        Extract all links from the document.
-
-        Args:
-            base_url (str, optional): Base URL for resolving relative links
-
-        Returns:
-            List[Dict[str, str]]: List of link dictionaries
-        """
-        links = []
-        for link in self.find_all(['a', 'link']):
-            href = link.get('href')
-            if href:
-                # Resolve relative URLs if base_url is provided
-                if base_url and not href.startswith(('http://', 'https://', '//')):
-                    href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
-
-                links.append({
-                    'href': href,
-                    'text': link.get_text(strip=True),
-                    'rel': link.get('rel', [None])[0],
-                    'type': link.get('type')
-                })
-        return links
-
-    def extract_metadata(self) -> Dict[str, Any]:
-        """
-        Extract metadata from HTML document.
-
-        Returns:
-            Dict[str, Any]: Extracted metadata
-        """
-        metadata = {
-            'title': self.find('title').texts()[0] if self.find('title').texts() else None,
-            'description': self.find('meta', attrs={'name': 'description'}).attrs('content')[0] if self.find('meta', attrs={'name': 'description'}).attrs('content') else None,
-            'keywords': self.find('meta', attrs={'name': 'keywords'}).attrs('content')[0].split(',') if self.find('meta', attrs={'name': 'keywords'}).attrs('content') else [],
-            'og_metadata': {},
-            'twitter_metadata': {}
-        }
-
-        # Open Graph metadata
-        for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
-            key = meta.attrs('property')[0][3:]
-            metadata['og_metadata'][key] = meta.attrs('content')[0]
-
-        # Twitter Card metadata
-        for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
-            key = meta.attrs('name')[0][8:]
-            metadata['twitter_metadata'][key] = meta.attrs('content')[0]
-
-        return metadata
-
-    def to_json(self, indent=2) -> str:
-        """
-        Convert parsed content to JSON.
-
-        Args:
-            indent (int, optional): JSON indentation
-
-        Returns:
-            str: JSON representation of the document
-        """
-        def _tag_to_dict(tag):
-            if isinstance(tag, NavigableString):
-                return str(tag)
-
-            result = {
-                'name': tag.name,
-                'attrs': tag.attrs,
-                'text': tag.get_text(strip=True)
-            }
-
-            if tag.contents:
-                result['children'] = [_tag_to_dict(child) for child in tag.contents]
-
-            return result
-
-        return json.dumps(_tag_to_dict(self._soup), indent=indent)
-
-    def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
-        """
-        Find the first matching element.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-            recursive (bool, optional): Search recursively
-            text (str, optional): Text content to match
-
-        Returns:
-            ScoutSearchResult: First matching element
-        """
-        result = self._soup.find(name, attrs, recursive, text, **kwargs)
-        return ScoutSearchResult([result]) if result else ScoutSearchResult([])
-
-    def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
-        """
-        Find all matching elements.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-            recursive (bool, optional): Search recursively
-            text (str, optional): Text content to match
-            limit (int, optional): Maximum number of results
-
-        Returns:
-            ScoutSearchResult: List of matching elements
-        """
-        results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
-        return ScoutSearchResult(results)
-
-    def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
-        """
-        Find the first parent matching given criteria.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-
-        Returns:
-            Tag or None: First matching parent
-        """
-        current = self._soup.parent
-        while current:
-            if (name is None or current.name == name) and \
-               all(current.get(k) == v for k, v in attrs.items()):
-                return current
-            current = current.parent
-        return None
-
-    def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
-        """
-        Find all parents matching given criteria.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-            limit (int, optional): Maximum number of results
-
-        Returns:
-            List[Tag]: List of matching parents
-        """
-        parents = []
-        current = self._soup.parent
-        while current and (limit is None or len(parents) < limit):
-            if (name is None or current.name == name) and \
-               all(current.get(k) == v for k, v in attrs.items()):
-                parents.append(current)
-            current = current.parent
-        return parents
-
-    def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
-        """
-        Find the next sibling matching given criteria.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-
-        Returns:
-            Tag or None: First matching next sibling
-        """
-        if not self._soup.parent:
-            return None
-
-        siblings = self._soup.parent.contents
-        try:
-            current_index = siblings.index(self._soup)
-            for sibling in siblings[current_index + 1:]:
-                if isinstance(sibling, Tag):
-                    if (name is None or sibling.name == name) and \
-                       all(sibling.get(k) == v for k, v in attrs.items()):
-                        return sibling
-        except ValueError:
-            pass
-        return None
-
-    def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
-        """
-        Find all next siblings matching given criteria.
-
-        Args:
-            name (str, optional): Tag name to search for
-            attrs (dict, optional): Attributes to match
-            limit (int, optional): Maximum number of results
-
-        Returns:
-            List[Tag]: List of matching next siblings
-        """
-        if not self._soup.parent:
-            return []
-
-        siblings = []
-        siblings_list = self._soup.parent.contents
-        try:
-            current_index = siblings_list.index(self._soup)
-            for sibling in siblings_list[current_index + 1:]:
-                if isinstance(sibling, Tag):
-                    if (name is None or sibling.name == name) and \
-                       all(sibling.get(k) == v for k, v in attrs.items()):
-                        siblings.append(sibling)
-                        if limit and len(siblings) == limit:
-                            break
-        except ValueError:
-            pass
-        return siblings
-
-    def select(self, selector: str) -> List[Tag]:
-        """
-        Select elements using CSS selector.
-
-        Args:
-            selector (str): CSS selector string
-
-        Returns:
-            List[Tag]: List of matching elements
-        """
-        return self._soup.select(selector)
-
-    def select_one(self, selector: str) -> Optional[Tag]:
-        """
-        Select the first element matching the CSS selector.
-
-        Args:
-            selector (str): CSS selector string
-
-        Returns:
-            Tag or None: First matching element
-        """
-        return self._soup.select_one(selector)
-
-    def get_text(self, separator=' ', strip=False, types=None) -> str:
-        """
-        Extract all text from the parsed document.
-
-        Args:
-            separator (str, optional): Text separator
-            strip (bool, optional): Strip whitespace
-            types (list, optional): Types of content to extract
-
-        Returns:
-            str: Extracted text
-        """
-        tokenizer = SentenceTokenizer()
-        text = self._soup.get_text(separator, strip, types)
-        sentences = tokenizer.tokenize(text)
-        return "\n\n".join(sentences)
-
-    def remove_tags(self, tags: List[str]) -> None:
-        """
-        Remove specified tags and their contents from the document.
-
-        Args:
-            tags (List[str]): List of tag names to remove
-        """
-        for tag_name in tags:
-            for tag in self._soup.find_all(tag_name):
-                tag.decompose()
-
-    def prettify(self, formatter='minimal') -> str:
-        """
-        Return a formatted, pretty-printed version of the HTML.
-
-        Args:
-            formatter (str, optional): Formatting style
-
-        Returns:
-            str: Prettified HTML
-        """
-        return self._soup.prettify(formatter)
-
-    def decompose(self, tag: Tag = None) -> None:
-        """
-        Remove a tag and its contents from the document.
-
-        Args:
-            tag (Tag, optional): Tag to remove. If None, removes the root tag.
-        """
-        if tag is None:
-            tag = self._soup
-        tag.decompose()
-
-    def extract(self, tag: Tag = None) -> Tag:
-        """
-        Remove a tag from the document and return it.
-
-        Args:
-            tag (Tag, optional): Tag to extract. If None, extracts the root tag.
-
-        Returns:
-            Tag: Extracted tag
-        """
-        if tag is None:
-            tag = self._soup
-        return tag.extract()
-
-    def clear(self, tag: Tag = None) -> None:
-        """
-        Remove a tag's contents while keeping the tag itself.
-
-        Args:
-            tag (Tag, optional): Tag to clear. If None, clears the root tag.
-        """
-        if tag is None:
-            tag = self._soup
-        tag.clear()
-
-    def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
-        """
-        Replace one tag with another.
-
-        Args:
-            old_tag (Tag): Tag to replace
-            new_tag (Tag): Replacement tag
-        """
-        old_tag.replace_with(new_tag)
-
-    def encode(self, encoding='utf-8') -> bytes:
-        """
-        Encode the document to a specific encoding.
-
-        Args:
-            encoding (str, optional): Encoding to use
-
-        Returns:
-            bytes: Encoded document
-        """
-        return str(self._soup).encode(encoding)
-
-    def decode(self, encoding='utf-8') -> str:
-        """
-        Decode the document from a specific encoding.
-
-        Args:
-            encoding (str, optional): Encoding to use
-
-        Returns:
-            str: Decoded document
-        """
-        return str(self._soup)
-
-    def __str__(self) -> str:
-        """
-        String representation of the parsed document.
-
-        Returns:
-            str: HTML content
-        """
-        return str(self._soup)
-
-    def __repr__(self) -> str:
-        """
-        Detailed representation of the Scout object.
-
-        Returns:
-            str: Scout object description
-        """
-        return f"Scout(features='{self.features}', content_length={len(self.markup)})"
-
-    def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
-        """
-        Preprocess markup before parsing.
-
-        Args:
-            markup (str): Input markup
-            encoding (str, optional): Encoding to use
-
-        Returns:
-            str: Preprocessed markup
-        """
-        # Decode markup
-        decoded_markup = decode_markup(markup, encoding)
-
-        # Basic HTML cleaning
-        # Remove comments, normalize whitespace, etc.
-        decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
-        decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
-
-        return decoded_markup
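
Likewise, a short sketch of the removed Scout parser API, again reconstructed from the deleted source above; the import path is an assumption based on the 8.2.2 layout. One behavioral quirk visible in the diff: get_text() runs the extracted text through SentenceTokenizer and re-joins sentences with blank lines, so it is not a verbatim text dump of the document.

# Hypothetical usage of the removed Scout parser (8.2.2 layout assumed).
from webscout.scout.core.scout import Scout

html = "<html><head><title>Demo</title></head><body><a href='/about'>About</a></body></html>"
scout = Scout(html, features='html.parser')

# find() wraps its match in a ScoutSearchResult (empty when nothing matches).
title = scout.find('title')

# extract_links() resolves relative hrefs against the supplied base URL.
links = scout.extract_links(base_url="https://example.com")

# hash_content() returns a hex digest of the serialized soup.
digest = scout.hash_content('sha256')

print(title, links, digest)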