webscout 8.2.7__py3-none-any.whl → 8.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (323) hide show
  1. webscout/AIauto.py +1 -1
  2. webscout/AIutel.py +298 -249
  3. webscout/Extra/Act.md +309 -0
  4. webscout/Extra/GitToolkit/__init__.py +10 -0
  5. webscout/Extra/GitToolkit/gitapi/README.md +110 -0
  6. webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
  7. webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
  8. webscout/Extra/GitToolkit/gitapi/user.py +96 -0
  9. webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
  10. webscout/Extra/YTToolkit/README.md +375 -0
  11. webscout/Extra/YTToolkit/YTdownloader.py +957 -0
  12. webscout/Extra/YTToolkit/__init__.py +3 -0
  13. webscout/Extra/YTToolkit/transcriber.py +476 -0
  14. webscout/Extra/YTToolkit/ytapi/README.md +44 -0
  15. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  16. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  17. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  18. webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
  19. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  20. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  21. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  22. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  23. webscout/Extra/YTToolkit/ytapi/query.py +40 -0
  24. webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
  25. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  26. webscout/Extra/YTToolkit/ytapi/video.py +232 -0
  27. webscout/Extra/__init__.py +7 -0
  28. webscout/Extra/autocoder/__init__.py +9 -0
  29. webscout/Extra/autocoder/autocoder.py +1105 -0
  30. webscout/Extra/autocoder/autocoder_utiles.py +332 -0
  31. webscout/Extra/gguf.md +430 -0
  32. webscout/Extra/gguf.py +684 -0
  33. webscout/Extra/tempmail/README.md +488 -0
  34. webscout/Extra/tempmail/__init__.py +28 -0
  35. webscout/Extra/tempmail/async_utils.py +141 -0
  36. webscout/Extra/tempmail/base.py +161 -0
  37. webscout/Extra/tempmail/cli.py +187 -0
  38. webscout/Extra/tempmail/emailnator.py +84 -0
  39. webscout/Extra/tempmail/mail_tm.py +361 -0
  40. webscout/Extra/tempmail/temp_mail_io.py +292 -0
  41. webscout/Extra/weather.md +281 -0
  42. webscout/Extra/weather.py +194 -0
  43. webscout/Extra/weather_ascii.py +76 -0
  44. webscout/Litlogger/Readme.md +175 -0
  45. webscout/Litlogger/__init__.py +67 -0
  46. webscout/Litlogger/core/__init__.py +6 -0
  47. webscout/Litlogger/core/level.py +23 -0
  48. webscout/Litlogger/core/logger.py +165 -0
  49. webscout/Litlogger/handlers/__init__.py +12 -0
  50. webscout/Litlogger/handlers/console.py +33 -0
  51. webscout/Litlogger/handlers/file.py +143 -0
  52. webscout/Litlogger/handlers/network.py +173 -0
  53. webscout/Litlogger/styles/__init__.py +7 -0
  54. webscout/Litlogger/styles/colors.py +249 -0
  55. webscout/Litlogger/styles/formats.py +458 -0
  56. webscout/Litlogger/styles/text.py +87 -0
  57. webscout/Litlogger/utils/__init__.py +6 -0
  58. webscout/Litlogger/utils/detectors.py +153 -0
  59. webscout/Litlogger/utils/formatters.py +200 -0
  60. webscout/Provider/AI21.py +177 -0
  61. webscout/Provider/AISEARCH/DeepFind.py +254 -0
  62. webscout/Provider/AISEARCH/Perplexity.py +359 -0
  63. webscout/Provider/AISEARCH/README.md +279 -0
  64. webscout/Provider/AISEARCH/__init__.py +9 -0
  65. webscout/Provider/AISEARCH/felo_search.py +228 -0
  66. webscout/Provider/AISEARCH/genspark_search.py +350 -0
  67. webscout/Provider/AISEARCH/hika_search.py +198 -0
  68. webscout/Provider/AISEARCH/iask_search.py +436 -0
  69. webscout/Provider/AISEARCH/monica_search.py +246 -0
  70. webscout/Provider/AISEARCH/scira_search.py +324 -0
  71. webscout/Provider/AISEARCH/webpilotai_search.py +281 -0
  72. webscout/Provider/Aitopia.py +316 -0
  73. webscout/Provider/AllenAI.py +440 -0
  74. webscout/Provider/Andi.py +228 -0
  75. webscout/Provider/Blackboxai.py +673 -0
  76. webscout/Provider/ChatGPTClone.py +237 -0
  77. webscout/Provider/ChatGPTGratis.py +194 -0
  78. webscout/Provider/ChatSandbox.py +342 -0
  79. webscout/Provider/Cloudflare.py +324 -0
  80. webscout/Provider/Cohere.py +208 -0
  81. webscout/Provider/Deepinfra.py +340 -0
  82. webscout/Provider/ExaAI.py +261 -0
  83. webscout/Provider/ExaChat.py +358 -0
  84. webscout/Provider/Flowith.py +217 -0
  85. webscout/Provider/FreeGemini.py +250 -0
  86. webscout/Provider/Gemini.py +169 -0
  87. webscout/Provider/GithubChat.py +370 -0
  88. webscout/Provider/GizAI.py +295 -0
  89. webscout/Provider/Glider.py +225 -0
  90. webscout/Provider/Groq.py +801 -0
  91. webscout/Provider/HF_space/__init__.py +0 -0
  92. webscout/Provider/HF_space/qwen_qwen2.py +206 -0
  93. webscout/Provider/HeckAI.py +285 -0
  94. webscout/Provider/HuggingFaceChat.py +469 -0
  95. webscout/Provider/Hunyuan.py +283 -0
  96. webscout/Provider/Jadve.py +291 -0
  97. webscout/Provider/Koboldai.py +384 -0
  98. webscout/Provider/LambdaChat.py +411 -0
  99. webscout/Provider/Llama3.py +259 -0
  100. webscout/Provider/MCPCore.py +315 -0
  101. webscout/Provider/Marcus.py +198 -0
  102. webscout/Provider/Nemotron.py +218 -0
  103. webscout/Provider/Netwrck.py +270 -0
  104. webscout/Provider/OLLAMA.py +396 -0
  105. webscout/Provider/OPENAI/BLACKBOXAI.py +735 -0
  106. webscout/Provider/OPENAI/Cloudflare.py +378 -0
  107. webscout/Provider/OPENAI/FreeGemini.py +282 -0
  108. webscout/Provider/OPENAI/NEMOTRON.py +244 -0
  109. webscout/Provider/OPENAI/README.md +1253 -0
  110. webscout/Provider/OPENAI/__init__.py +36 -0
  111. webscout/Provider/OPENAI/ai4chat.py +293 -0
  112. webscout/Provider/OPENAI/api.py +810 -0
  113. webscout/Provider/OPENAI/base.py +249 -0
  114. webscout/Provider/OPENAI/c4ai.py +373 -0
  115. webscout/Provider/OPENAI/chatgpt.py +556 -0
  116. webscout/Provider/OPENAI/chatgptclone.py +488 -0
  117. webscout/Provider/OPENAI/chatsandbox.py +172 -0
  118. webscout/Provider/OPENAI/deepinfra.py +319 -0
  119. webscout/Provider/OPENAI/e2b.py +1356 -0
  120. webscout/Provider/OPENAI/exaai.py +411 -0
  121. webscout/Provider/OPENAI/exachat.py +443 -0
  122. webscout/Provider/OPENAI/flowith.py +162 -0
  123. webscout/Provider/OPENAI/freeaichat.py +359 -0
  124. webscout/Provider/OPENAI/glider.py +323 -0
  125. webscout/Provider/OPENAI/groq.py +361 -0
  126. webscout/Provider/OPENAI/heckai.py +307 -0
  127. webscout/Provider/OPENAI/llmchatco.py +335 -0
  128. webscout/Provider/OPENAI/mcpcore.py +383 -0
  129. webscout/Provider/OPENAI/multichat.py +376 -0
  130. webscout/Provider/OPENAI/netwrck.py +356 -0
  131. webscout/Provider/OPENAI/opkfc.py +496 -0
  132. webscout/Provider/OPENAI/scirachat.py +471 -0
  133. webscout/Provider/OPENAI/sonus.py +303 -0
  134. webscout/Provider/OPENAI/standardinput.py +433 -0
  135. webscout/Provider/OPENAI/textpollinations.py +339 -0
  136. webscout/Provider/OPENAI/toolbaz.py +413 -0
  137. webscout/Provider/OPENAI/typefully.py +355 -0
  138. webscout/Provider/OPENAI/typegpt.py +358 -0
  139. webscout/Provider/OPENAI/uncovrAI.py +462 -0
  140. webscout/Provider/OPENAI/utils.py +307 -0
  141. webscout/Provider/OPENAI/venice.py +425 -0
  142. webscout/Provider/OPENAI/wisecat.py +381 -0
  143. webscout/Provider/OPENAI/writecream.py +163 -0
  144. webscout/Provider/OPENAI/x0gpt.py +378 -0
  145. webscout/Provider/OPENAI/yep.py +356 -0
  146. webscout/Provider/OpenGPT.py +209 -0
  147. webscout/Provider/Openai.py +496 -0
  148. webscout/Provider/PI.py +429 -0
  149. webscout/Provider/Perplexitylabs.py +415 -0
  150. webscout/Provider/QwenLM.py +254 -0
  151. webscout/Provider/Reka.py +214 -0
  152. webscout/Provider/StandardInput.py +290 -0
  153. webscout/Provider/TTI/AiForce/README.md +159 -0
  154. webscout/Provider/TTI/AiForce/__init__.py +22 -0
  155. webscout/Provider/TTI/AiForce/async_aiforce.py +224 -0
  156. webscout/Provider/TTI/AiForce/sync_aiforce.py +245 -0
  157. webscout/Provider/TTI/FreeAIPlayground/README.md +99 -0
  158. webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -0
  159. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +181 -0
  160. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +180 -0
  161. webscout/Provider/TTI/ImgSys/README.md +174 -0
  162. webscout/Provider/TTI/ImgSys/__init__.py +23 -0
  163. webscout/Provider/TTI/ImgSys/async_imgsys.py +202 -0
  164. webscout/Provider/TTI/ImgSys/sync_imgsys.py +195 -0
  165. webscout/Provider/TTI/MagicStudio/README.md +101 -0
  166. webscout/Provider/TTI/MagicStudio/__init__.py +2 -0
  167. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +111 -0
  168. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +109 -0
  169. webscout/Provider/TTI/Nexra/README.md +155 -0
  170. webscout/Provider/TTI/Nexra/__init__.py +22 -0
  171. webscout/Provider/TTI/Nexra/async_nexra.py +286 -0
  172. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -0
  173. webscout/Provider/TTI/PollinationsAI/README.md +146 -0
  174. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -0
  175. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +311 -0
  176. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +265 -0
  177. webscout/Provider/TTI/README.md +128 -0
  178. webscout/Provider/TTI/__init__.py +12 -0
  179. webscout/Provider/TTI/aiarta/README.md +134 -0
  180. webscout/Provider/TTI/aiarta/__init__.py +2 -0
  181. webscout/Provider/TTI/aiarta/async_aiarta.py +482 -0
  182. webscout/Provider/TTI/aiarta/sync_aiarta.py +440 -0
  183. webscout/Provider/TTI/artbit/README.md +100 -0
  184. webscout/Provider/TTI/artbit/__init__.py +22 -0
  185. webscout/Provider/TTI/artbit/async_artbit.py +155 -0
  186. webscout/Provider/TTI/artbit/sync_artbit.py +148 -0
  187. webscout/Provider/TTI/fastflux/README.md +129 -0
  188. webscout/Provider/TTI/fastflux/__init__.py +22 -0
  189. webscout/Provider/TTI/fastflux/async_fastflux.py +261 -0
  190. webscout/Provider/TTI/fastflux/sync_fastflux.py +252 -0
  191. webscout/Provider/TTI/huggingface/README.md +114 -0
  192. webscout/Provider/TTI/huggingface/__init__.py +22 -0
  193. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -0
  194. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -0
  195. webscout/Provider/TTI/piclumen/README.md +161 -0
  196. webscout/Provider/TTI/piclumen/__init__.py +23 -0
  197. webscout/Provider/TTI/piclumen/async_piclumen.py +268 -0
  198. webscout/Provider/TTI/piclumen/sync_piclumen.py +233 -0
  199. webscout/Provider/TTI/pixelmuse/README.md +79 -0
  200. webscout/Provider/TTI/pixelmuse/__init__.py +4 -0
  201. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +249 -0
  202. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +182 -0
  203. webscout/Provider/TTI/talkai/README.md +139 -0
  204. webscout/Provider/TTI/talkai/__init__.py +4 -0
  205. webscout/Provider/TTI/talkai/async_talkai.py +229 -0
  206. webscout/Provider/TTI/talkai/sync_talkai.py +207 -0
  207. webscout/Provider/TTS/README.md +192 -0
  208. webscout/Provider/TTS/__init__.py +9 -0
  209. webscout/Provider/TTS/base.py +159 -0
  210. webscout/Provider/TTS/deepgram.py +156 -0
  211. webscout/Provider/TTS/elevenlabs.py +111 -0
  212. webscout/Provider/TTS/gesserit.py +128 -0
  213. webscout/Provider/TTS/murfai.py +113 -0
  214. webscout/Provider/TTS/parler.py +111 -0
  215. webscout/Provider/TTS/speechma.py +580 -0
  216. webscout/Provider/TTS/sthir.py +94 -0
  217. webscout/Provider/TTS/streamElements.py +333 -0
  218. webscout/Provider/TTS/utils.py +280 -0
  219. webscout/Provider/TeachAnything.py +229 -0
  220. webscout/Provider/TextPollinationsAI.py +308 -0
  221. webscout/Provider/TwoAI.py +280 -0
  222. webscout/Provider/TypliAI.py +305 -0
  223. webscout/Provider/UNFINISHED/ChatHub.py +209 -0
  224. webscout/Provider/UNFINISHED/Youchat.py +330 -0
  225. webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
  226. webscout/Provider/UNFINISHED/oivscode.py +351 -0
  227. webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
  228. webscout/Provider/Venice.py +258 -0
  229. webscout/Provider/VercelAI.py +253 -0
  230. webscout/Provider/WiseCat.py +233 -0
  231. webscout/Provider/WrDoChat.py +370 -0
  232. webscout/Provider/Writecream.py +246 -0
  233. webscout/Provider/WritingMate.py +269 -0
  234. webscout/Provider/__init__.py +172 -0
  235. webscout/Provider/ai4chat.py +149 -0
  236. webscout/Provider/akashgpt.py +335 -0
  237. webscout/Provider/asksteve.py +220 -0
  238. webscout/Provider/cerebras.py +290 -0
  239. webscout/Provider/chatglm.py +215 -0
  240. webscout/Provider/cleeai.py +213 -0
  241. webscout/Provider/copilot.py +425 -0
  242. webscout/Provider/elmo.py +283 -0
  243. webscout/Provider/freeaichat.py +285 -0
  244. webscout/Provider/geminiapi.py +208 -0
  245. webscout/Provider/granite.py +235 -0
  246. webscout/Provider/hermes.py +266 -0
  247. webscout/Provider/julius.py +223 -0
  248. webscout/Provider/koala.py +170 -0
  249. webscout/Provider/learnfastai.py +325 -0
  250. webscout/Provider/llama3mitril.py +215 -0
  251. webscout/Provider/llmchat.py +258 -0
  252. webscout/Provider/llmchatco.py +306 -0
  253. webscout/Provider/lmarena.py +198 -0
  254. webscout/Provider/meta.py +801 -0
  255. webscout/Provider/multichat.py +364 -0
  256. webscout/Provider/samurai.py +223 -0
  257. webscout/Provider/scira_chat.py +299 -0
  258. webscout/Provider/scnet.py +243 -0
  259. webscout/Provider/searchchat.py +292 -0
  260. webscout/Provider/sonus.py +258 -0
  261. webscout/Provider/talkai.py +194 -0
  262. webscout/Provider/toolbaz.py +353 -0
  263. webscout/Provider/turboseek.py +266 -0
  264. webscout/Provider/typefully.py +202 -0
  265. webscout/Provider/typegpt.py +289 -0
  266. webscout/Provider/uncovr.py +368 -0
  267. webscout/Provider/x0gpt.py +299 -0
  268. webscout/Provider/yep.py +389 -0
  269. webscout/__init__.py +4 -2
  270. webscout/cli.py +3 -28
  271. webscout/conversation.py +35 -35
  272. webscout/litagent/Readme.md +276 -0
  273. webscout/litagent/__init__.py +29 -0
  274. webscout/litagent/agent.py +455 -0
  275. webscout/litagent/constants.py +60 -0
  276. webscout/litprinter/__init__.py +59 -0
  277. webscout/scout/README.md +402 -0
  278. webscout/scout/__init__.py +8 -0
  279. webscout/scout/core/__init__.py +7 -0
  280. webscout/scout/core/crawler.py +140 -0
  281. webscout/scout/core/scout.py +568 -0
  282. webscout/scout/core/search_result.py +96 -0
  283. webscout/scout/core/text_analyzer.py +63 -0
  284. webscout/scout/core/text_utils.py +277 -0
  285. webscout/scout/core/web_analyzer.py +52 -0
  286. webscout/scout/element.py +460 -0
  287. webscout/scout/parsers/__init__.py +69 -0
  288. webscout/scout/parsers/html5lib_parser.py +172 -0
  289. webscout/scout/parsers/html_parser.py +236 -0
  290. webscout/scout/parsers/lxml_parser.py +178 -0
  291. webscout/scout/utils.py +37 -0
  292. webscout/swiftcli/Readme.md +323 -0
  293. webscout/swiftcli/__init__.py +95 -0
  294. webscout/swiftcli/core/__init__.py +7 -0
  295. webscout/swiftcli/core/cli.py +297 -0
  296. webscout/swiftcli/core/context.py +104 -0
  297. webscout/swiftcli/core/group.py +241 -0
  298. webscout/swiftcli/decorators/__init__.py +28 -0
  299. webscout/swiftcli/decorators/command.py +221 -0
  300. webscout/swiftcli/decorators/options.py +220 -0
  301. webscout/swiftcli/decorators/output.py +252 -0
  302. webscout/swiftcli/exceptions.py +21 -0
  303. webscout/swiftcli/plugins/__init__.py +9 -0
  304. webscout/swiftcli/plugins/base.py +135 -0
  305. webscout/swiftcli/plugins/manager.py +262 -0
  306. webscout/swiftcli/utils/__init__.py +59 -0
  307. webscout/swiftcli/utils/formatting.py +252 -0
  308. webscout/swiftcli/utils/parsing.py +267 -0
  309. webscout/version.py +1 -1
  310. webscout/webscout_search.py +2 -182
  311. webscout/webscout_search_async.py +1 -179
  312. webscout/zeroart/README.md +89 -0
  313. webscout/zeroart/__init__.py +135 -0
  314. webscout/zeroart/base.py +66 -0
  315. webscout/zeroart/effects.py +101 -0
  316. webscout/zeroart/fonts.py +1239 -0
  317. {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/METADATA +115 -60
  318. webscout-8.2.8.dist-info/RECORD +334 -0
  319. {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/WHEEL +1 -1
  320. webscout-8.2.7.dist-info/RECORD +0 -26
  321. {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/entry_points.txt +0 -0
  322. {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/licenses/LICENSE.md +0 -0
  323. {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,402 @@
1
+ # 🕵️ Scout: Next-Gen Web Parsing Library
2
+
3
+ <div align="center">
4
+
5
+ [![Python](https://img.shields.io/badge/Python-3.8%2B-blue)](https://www.python.org/)
6
+ [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
7
+ [![Maintenance](https://img.shields.io/badge/Maintained-Yes-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout)
8
+ [![Documentation](https://img.shields.io/badge/Docs-Wiki-orange)](https://github.com/OE-LUCIFER/Webscout/wiki)
9
+ [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout/pulls)
10
+
11
+ </div>
12
+
13
+ ## 📋 Overview
14
+
15
+ Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.
16
+
17
+ <details open>
18
+ <summary><b>Why Choose Scout?</b></summary>
19
+
20
+ - **Powerful Parsing**: Multiple parser backends with intelligent markup handling
21
+ - **Advanced Analysis**: Built-in text and web content analysis tools
22
+ - **Concurrent Crawling**: Efficient multi-threaded web crawling
23
+ - **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
24
+ - **Format Conversion**: Convert HTML to JSON, Markdown, and more
25
+
26
+ </details>
27
+
28
+ ## 📑 Table of Contents
29
+
30
+ - [Installation](#-installation)
31
+ - [Quick Start](#-quick-start)
32
+ - [Features](#-features)
33
+ - [Advanced Usage](#-advanced-usage)
34
+ - [API Reference](#-api-reference)
35
+ - [Dependencies](#-dependencies)
36
+ - [Supported Python Versions](#-supported-python-versions)
37
+ - [Contributing](#-contributing)
38
+ - [License](#-license)
39
+
40
+ ## 📦 Installation
41
+
42
+ ```bash
43
+ pip install webscout
44
+ ```
45
+
46
+ Or install the latest version from GitHub:
47
+
48
+ ```bash
49
+ pip install git+https://github.com/OE-LUCIFER/Webscout.git
50
+ ```
51
+
52
+ ## 🚀 Quick Start
53
+
54
+ ### Basic Parsing
55
+
56
+ ```python
57
+ from webscout.scout import Scout
58
+
59
+ # Parse HTML content
60
+ html_content = """
61
+ <html>
62
+ <body>
63
+ <h1>Hello, Scout!</h1>
64
+ <div class="content">
65
+ <p>Web parsing made easy.</p>
66
+ <a href="https://example.com">Link</a>
67
+ </div>
68
+ </body>
69
+ </html>
70
+ """
71
+
72
+ scout = Scout(html_content)
73
+
74
+ # Find elements
75
+ title = scout.find('h1')
76
+ links = scout.find_all('a')
77
+
78
+ # Extract text
79
+ print(title[0].get_text()) # Output: Hello, Scout!
80
+ print(links.attrs('href')) # Output: ['https://example.com']
81
+ ```
82
+
83
+ ### Web Crawling
84
+
85
+ ```python
86
+ from webscout.scout import ScoutCrawler
87
+
88
+ # Crawl a website with default settings
89
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
90
+
91
+ # Or customize the crawler
92
+ crawler = ScoutCrawler(
93
+ 'https://example.com', # base_url
94
+ max_pages=100, # maximum pages to crawl
95
+ tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
96
+ )
97
+
98
+ # Start crawling
99
+ crawled_pages = crawler.crawl()
100
+
101
+ for page in crawled_pages:
102
+ print(f"URL: {page['url']}")
103
+ print(f"Title: {page['title']}")
104
+ print(f"Links found: {len(page['links'])}")
105
+ print(f"Crawl depth: {page['depth']}")
106
+ ```
107
+
108
+ ### Text Analysis
109
+
110
+ ```python
111
+ from webscout.scout import Scout
112
+
113
+ # Parse a webpage
114
+ html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
115
+ <p>Visit https://climate-action.org for more information.</p></div>"""
116
+ scout = Scout(html)
117
+
118
+ # Analyze text and extract entities
119
+ analysis = scout.analyze_text()
120
+ print(f"Word frequencies: {analysis['word_count']}")
121
+ print(f"Entities found: {analysis['entities']}")
122
+ ```
123
+
124
+ ## ✨ Features
125
+
126
+ ### 🔍 Multiple Parser Support
127
+
128
+ Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:
129
+
130
+ | Parser | Description | Best For |
131
+ |--------|-------------|----------|
132
+ | `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
133
+ | `lxml` | Fast C-based parser | Performance-critical applications |
134
+ | `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
135
+ | `lxml-xml` | XML parser | XML document parsing |
136
+
137
+ ```python
138
+ # Choose your parser
139
+ scout = Scout(html_content, features='lxml') # For speed
140
+ scout = Scout(html_content, features='html5lib') # For compliance
141
+ ```
142
+
143
+ ### 🌐 Advanced Parsing Capabilities
144
+
145
+ Scout provides powerful tools for navigating and manipulating HTML/XML documents:
146
+
147
+ - **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
148
+ - **Tree Traversal**: Navigate parent-child relationships and sibling elements
149
+ - **Content Extraction**: Extract text, attributes, and structured data
150
+ - **Document Manipulation**: Modify, replace, or remove elements
151
+
152
+ ```python
153
+ # CSS selector support
154
+ elements = scout.select('div.content > p')
155
+
156
+ # Advanced find with attribute matching
157
+ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
158
+
159
+ # Tree traversal
160
+ parent = element.find_parent('div')
161
+ siblings = element.find_next_siblings('p')
162
+ ```
163
+
164
+ ### 🧠 Intelligent Analysis
165
+
166
+ Scout includes built-in analysis tools for extracting insights from web content:
167
+
168
+ #### Text Analysis
169
+
170
+ ```python
171
+ # Extract and analyze text
172
+ text = scout.get_text()
173
+ word_counts = scout.text_analyzer.count_words(text)
174
+ entities = scout.text_analyzer.extract_entities(text)
175
+ ```
176
+
177
+ #### Web Structure Analysis
178
+
179
+ ```python
180
+ # Analyze page structure
181
+ structure = scout.analyze_page_structure()
182
+ print(f"Most common tags: {structure['tag_distribution']}")
183
+ print(f"Page depth: {max(structure['depth_analysis'].keys())}")
184
+ ```
185
+
186
+ #### Semantic Information Extraction
187
+
188
+ ```python
189
+ # Extract semantic information
190
+ semantics = scout.extract_semantic_info()
191
+ print(f"Headings: {semantics['headings']}")
192
+ print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
193
+ print(f"Tables: {semantics['tables']['count']}")
194
+ ```
195
+
196
+ ### 🕸️ Web Crawling
197
+
198
+ Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:
199
+
200
+ ```python
201
+ from webscout.scout import ScoutCrawler
202
+
203
+ # Create a crawler with default settings
204
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
205
+
206
+ # Or customize the crawler with specific options
207
+ crawler = ScoutCrawler(
208
+ 'https://example.com', # base_url
209
+ max_pages=100, # maximum pages to crawl
210
+ tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
211
+ )
212
+
213
+ # Start crawling
214
+ pages = crawler.crawl()
215
+
216
+ # Process results
217
+ for page in pages:
218
+ print(f"URL: {page['url']}")
219
+ print(f"Title: {page['title']}")
220
+ print(f"Links: {len(page['links'])}")
221
+ print(f"Depth: {page['depth']}")
222
+ ```
223
+
224
+ The crawler automatically:
225
+ - Stays within the same domain as the base URL
226
+ - Uses concurrent requests for faster crawling
227
+ - Removes unwanted tags (like scripts and styles) for cleaner text extraction
228
+ - Tracks crawl depth for each page
229
+
230
+ ### 📄 Format Conversion
231
+
232
+ Scout can convert HTML to various formats:
233
+
234
+ ```python
235
+ # Convert to JSON
236
+ json_data = scout.to_json(indent=2)
237
+
238
+ # Convert to Markdown
239
+ markdown = scout.to_markdown(heading_style='ATX')
240
+
241
+ # Pretty-print HTML
242
+ pretty_html = scout.prettify()
243
+ ```
244
+
245
+ ## 🔬 Advanced Usage
246
+
247
+ ### Working with Search Results
248
+
249
+ Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:
250
+
251
+ ```python
252
+ from webscout.scout import Scout
253
+
254
+ scout = Scout(html_content)
255
+
256
+ # Find all paragraphs
257
+ paragraphs = scout.find_all('p')
258
+
259
+ # Extract all text from results
260
+ all_text = paragraphs.texts(separator='\n')
261
+
262
+ # Extract specific attributes
263
+ hrefs = paragraphs.attrs('href')
264
+
265
+ # Filter results with a predicate function
266
+ important = paragraphs.filter(lambda p: 'important' in p.get('class', []))
267
+
268
+ # Transform results
269
+ word_counts = paragraphs.map(lambda p: len(p.get_text().split()))
270
+
271
+ # Analyze text in results
272
+ analysis = paragraphs.analyze_text()
273
+ ```
274
+
275
+ ### URL Handling and Analysis
276
+
277
+ ```python
278
+ from webscout.scout import Scout
279
+
280
+ scout = Scout(html_content)
281
+
282
+ # Parse and analyze URLs
283
+ links = scout.extract_links(base_url='https://example.com')
284
+ for link in links:
285
+ url_components = scout.url_parse(link['href'])
286
+ print(f"Domain: {url_components['netloc']}")
287
+ print(f"Path: {url_components['path']}")
288
+ ```
289
+
290
+ ### Metadata Extraction
291
+
292
+ ```python
293
+ from webscout.scout import Scout
294
+
295
+ scout = Scout(html_content)
296
+
297
+ # Extract metadata
298
+ metadata = scout.extract_metadata()
299
+ print(f"Title: {metadata['title']}")
300
+ print(f"Description: {metadata['description']}")
301
+ print(f"Open Graph: {metadata['og_metadata']}")
302
+ print(f"Twitter Card: {metadata['twitter_metadata']}")
303
+ ```
304
+
305
+ ### Content Hashing and Caching
306
+
307
+ ```python
308
+ from webscout.scout import Scout
309
+
310
+ scout = Scout(html_content)
311
+
312
+ # Generate content hash
313
+ content_hash = scout.hash_content(method='sha256')
314
+
315
+ # Use caching for expensive operations
316
+ if not scout.cache('parsed_data'):
317
+ data = scout.extract_semantic_info()
318
+ scout.cache('parsed_data', data)
319
+
320
+ cached_data = scout.cache('parsed_data')
321
+ ```
322
+
323
+ ## 📚 API Reference
324
+
325
+ ### Core Classes
326
+
327
+ | Class | Description |
328
+ |-------|-------------|
329
+ | `Scout` | Main class for HTML parsing and traversal |
330
+ | `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
331
+ | `ScoutTextAnalyzer` | Text analysis utilities |
332
+ | `ScoutWebAnalyzer` | Web page analysis utilities |
333
+ | `ScoutSearchResult` | Enhanced search results with filtering and analysis |
334
+ | `Tag` | Represents an HTML/XML tag |
335
+ | `NavigableString` | Represents text within an HTML/XML document |
336
+
337
+ ### Key Methods
338
+
339
+ #### Scout Class
340
+
341
+ - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
342
+ - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
343
+ - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
344
+ - `select(selector)`: Find elements using CSS selector
345
+ - `get_text(separator=' ', strip=False)`: Extract text from document
346
+ - `analyze_text()`: Perform text analysis
347
+ - `analyze_page_structure()`: Analyze document structure
348
+ - `extract_semantic_info()`: Extract semantic information
349
+ - `extract_links(base_url=None)`: Extract all links
350
+ - `extract_metadata()`: Extract metadata from document
351
+ - `to_json(indent=2)`: Convert to JSON
352
+ - `to_markdown(heading_style='ATX')`: Convert to Markdown
353
+ - `prettify(formatter='minimal')`: Pretty-print HTML
354
+
355
+ #### ScoutCrawler Class
356
+
357
+ - `__init__(base_url, max_pages=50, tags_to_remove=None)`: Initialize the crawler
358
+ - `crawl()`: Start crawling from the base URL
359
+ - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
360
+ - `_is_valid_url(url)`: Check if a URL is valid (internal method)
361
+
362
+ For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).
363
+
364
+ ## 🔧 Dependencies
365
+
366
+ - `requests`: HTTP library for making web requests
367
+ - `lxml`: XML and HTML processing library (optional, recommended)
368
+ - `html5lib`: Standards-compliant HTML parser (optional)
369
+ - `markdownify`: HTML to Markdown conversion
370
+ - `concurrent.futures`: Asynchronous execution (standard library)
371
+
372
+ ## 🌈 Supported Python Versions
373
+
374
+ - Python 3.8+
375
+
376
+ ## 🤝 Contributing
377
+
378
+ Contributions are welcome! Here's how you can contribute:
379
+
380
+ 1. Fork the repository
381
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
382
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
383
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
384
+ 5. Open a Pull Request
385
+
386
+ Please make sure to update tests as appropriate.
387
+
388
+ ## 📄 License
389
+
390
+ This project is licensed under the MIT License - see the LICENSE file for details.
391
+
392
+ ---
393
+
394
+ <div align="center">
395
+ <p>Made with ❤️ by the Webscout team</p>
396
+ <p>
397
+ <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a> •
398
+ <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a> •
399
+ <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a> •
400
+ <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
401
+ </p>
402
+ </div>
@@ -0,0 +1,8 @@
1
+ """
2
+ Scout: A powerful, zero-dependency web scraping library
3
+ """
4
+
5
+ from .core import Scout, ScoutCrawler, ScoutTextAnalyzer, ScoutWebAnalyzer, ScoutSearchResult
6
+ from .element import Tag, NavigableString
7
+
8
+ __all__ = ['Scout', 'ScoutCrawler', 'Tag', 'NavigableString','ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult']
@@ -0,0 +1,7 @@
1
+ from .text_analyzer import ScoutTextAnalyzer
2
+ from .web_analyzer import ScoutWebAnalyzer
3
+ from .search_result import ScoutSearchResult
4
+ from .crawler import ScoutCrawler
5
+ from .scout import Scout
6
+
7
+ __all__ = ['ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult', 'ScoutCrawler', 'Scout']
@@ -0,0 +1,140 @@
1
+ """
2
+ Scout Crawler Module
3
+ """
4
+
5
+ import concurrent.futures
6
+ import urllib.parse
7
+ from typing import Union, List, Dict
8
+ import requests
9
+
10
+ from .scout import Scout
11
+
12
class ScoutCrawler:
    """
    Concurrent web-crawling utility for the Scout library.

    Starting from ``base_url``, pages are fetched with a thread pool. The
    crawler stays within the base URL's domain, strips noisy tags (scripts,
    styles, page chrome) before text extraction, and records a dict with
    ``url``, ``title``, ``links``, ``text`` and ``depth`` for each page.
    """

    def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None):
        """
        Initialize the web crawler.

        Args:
            base_url (str): Starting URL to crawl.
            max_pages (int, optional): Maximum number of pages to crawl.
            tags_to_remove (List[str], optional): Tags removed from each page
                before text extraction. Defaults to script/style and common
                page-chrome tags.
        """
        self.base_url = base_url
        self.max_pages = max_pages
        # Default mirrors the documented behavior: drop non-content tags so
        # the extracted text is clean.
        self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
            "script", "style", "header", "footer", "nav", "aside", "form", "button"
        ]
        self.visited_urls = set()
        self.crawled_pages = []

    def _is_valid_url(self, url: str) -> bool:
        """
        Check if a URL is valid and within the same domain.

        A URL is accepted only if it uses http(s), shares the base URL's
        network location, and the page budget has not yet been exhausted.

        Args:
            url (str): URL to validate.

        Returns:
            bool: Whether the URL is valid.
        """
        try:
            parsed_base = urllib.parse.urlparse(self.base_url)
            parsed_url = urllib.parse.urlparse(url)

            return (
                parsed_url.scheme in ['http', 'https'] and
                parsed_base.netloc == parsed_url.netloc and
                len(self.visited_urls) < self.max_pages
            )
        except Exception:
            # Malformed URLs are simply rejected rather than raising.
            return False

    def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
        """
        Crawl a single page and extract information.

        Args:
            url (str): URL to crawl.
            depth (int, optional): Current crawl depth.

        Returns:
            Dict[str, Union[str, List[str]]]: Crawled page information, or an
            empty dict when the URL was already visited or the fetch failed.
        """
        if url in self.visited_urls:
            return {}

        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()

            scout = Scout(response.content, features='lxml')

            title_result = scout.find('title')
            title = title_result[0].get_text() if title_result else ''

            # BUGFIX: remove unwanted tags BEFORE extracting text. The
            # original extracted text first, which made tags_to_remove a
            # no-op for the 'text' field (scripts/styles leaked into it).
            for tag in scout._soup(self.tags_to_remove):
                tag.extract()

            visible_text = scout._soup.get_text(strip=True)

            page_info = {
                'url': url,
                'title': title,
                'links': [
                    urllib.parse.urljoin(url, link.get('href'))
                    for link in scout.find_all('a', href=True)
                    if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
                ],
                'text': visible_text,
                'depth': depth
            }

            self.visited_urls.add(url)
            self.crawled_pages.append(page_info)

            return page_info
        except Exception as e:
            # Best-effort crawl: report and skip pages that fail to fetch/parse.
            print(f"Error crawling {url}: {e}")
            return {}

    def crawl(self) -> List[Dict[str, Union[str, List[str]]]]:
        """
        Start web crawling from the base URL.

        Returns:
            List[Dict[str, Union[str, List[str]]]]: List of crawled pages.
        """
        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
            futures = {executor.submit(self._crawl_page, self.base_url, 0)}
            # BUGFIX: track submitted links across ALL wait iterations. The
            # original re-created this set inside the while loop, so the same
            # link discovered in different waves could be submitted (and
            # fetched) more than once.
            submitted_links = {self.base_url}

            while futures:
                done, futures = concurrent.futures.wait(
                    futures, return_when=concurrent.futures.FIRST_COMPLETED
                )

                for future in done:
                    page_info = future.result()

                    if len(self.visited_urls) >= self.max_pages:
                        break

                    for link in page_info.get('links', []):
                        if (
                            len(self.visited_urls) < self.max_pages and
                            link not in self.visited_urls and
                            link not in submitted_links
                        ):
                            submitted_links.add(link)
                            futures.add(
                                executor.submit(
                                    self._crawl_page,
                                    link,
                                    page_info.get('depth', 0) + 1
                                )
                            )
                        if len(self.visited_urls) >= self.max_pages:
                            break

        return self.crawled_pages