webscout 8.2.7__py3-none-any.whl → 8.2.9__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (281)
  1. webscout/AIauto.py +33 -15
  2. webscout/AIbase.py +96 -37
  3. webscout/AIutel.py +703 -250
  4. webscout/Bard.py +441 -323
  5. webscout/Extra/Act.md +309 -0
  6. webscout/Extra/GitToolkit/__init__.py +10 -0
  7. webscout/Extra/GitToolkit/gitapi/README.md +110 -0
  8. webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
  10. webscout/Extra/GitToolkit/gitapi/user.py +96 -0
  11. webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
  12. webscout/Extra/YTToolkit/README.md +375 -0
  13. webscout/Extra/YTToolkit/YTdownloader.py +957 -0
  14. webscout/Extra/YTToolkit/__init__.py +3 -0
  15. webscout/Extra/YTToolkit/transcriber.py +476 -0
  16. webscout/Extra/YTToolkit/ytapi/README.md +44 -0
  17. webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
  18. webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
  19. webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
  20. webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
  21. webscout/Extra/YTToolkit/ytapi/https.py +88 -0
  22. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
  23. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
  24. webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
  25. webscout/Extra/YTToolkit/ytapi/query.py +40 -0
  26. webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
  27. webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
  28. webscout/Extra/YTToolkit/ytapi/video.py +232 -0
  29. webscout/Extra/__init__.py +7 -0
  30. webscout/Extra/autocoder/__init__.py +9 -0
  31. webscout/Extra/autocoder/autocoder.py +1105 -0
  32. webscout/Extra/autocoder/autocoder_utiles.py +332 -0
  33. webscout/Extra/gguf.md +430 -0
  34. webscout/Extra/gguf.py +684 -0
  35. webscout/Extra/tempmail/README.md +488 -0
  36. webscout/Extra/tempmail/__init__.py +28 -0
  37. webscout/Extra/tempmail/async_utils.py +141 -0
  38. webscout/Extra/tempmail/base.py +161 -0
  39. webscout/Extra/tempmail/cli.py +187 -0
  40. webscout/Extra/tempmail/emailnator.py +84 -0
  41. webscout/Extra/tempmail/mail_tm.py +361 -0
  42. webscout/Extra/tempmail/temp_mail_io.py +292 -0
  43. webscout/Extra/weather.md +281 -0
  44. webscout/Extra/weather.py +194 -0
  45. webscout/Extra/weather_ascii.py +76 -0
  46. webscout/Litlogger/README.md +10 -0
  47. webscout/Litlogger/__init__.py +15 -0
  48. webscout/Litlogger/formats.py +4 -0
  49. webscout/Litlogger/handlers.py +103 -0
  50. webscout/Litlogger/levels.py +13 -0
  51. webscout/Litlogger/logger.py +92 -0
  52. webscout/Provider/AI21.py +177 -0
  53. webscout/Provider/AISEARCH/DeepFind.py +254 -0
  54. webscout/Provider/AISEARCH/Perplexity.py +333 -0
  55. webscout/Provider/AISEARCH/README.md +279 -0
  56. webscout/Provider/AISEARCH/__init__.py +9 -0
  57. webscout/Provider/AISEARCH/felo_search.py +202 -0
  58. webscout/Provider/AISEARCH/genspark_search.py +324 -0
  59. webscout/Provider/AISEARCH/hika_search.py +186 -0
  60. webscout/Provider/AISEARCH/iask_search.py +410 -0
  61. webscout/Provider/AISEARCH/monica_search.py +220 -0
  62. webscout/Provider/AISEARCH/scira_search.py +298 -0
  63. webscout/Provider/AISEARCH/webpilotai_search.py +255 -0
  64. webscout/Provider/Aitopia.py +316 -0
  65. webscout/Provider/AllenAI.py +440 -0
  66. webscout/Provider/Andi.py +228 -0
  67. webscout/Provider/Blackboxai.py +791 -0
  68. webscout/Provider/ChatGPTClone.py +237 -0
  69. webscout/Provider/ChatGPTGratis.py +194 -0
  70. webscout/Provider/ChatSandbox.py +342 -0
  71. webscout/Provider/Cloudflare.py +324 -0
  72. webscout/Provider/Cohere.py +208 -0
  73. webscout/Provider/Deepinfra.py +340 -0
  74. webscout/Provider/ExaAI.py +261 -0
  75. webscout/Provider/ExaChat.py +358 -0
  76. webscout/Provider/Flowith.py +217 -0
  77. webscout/Provider/FreeGemini.py +250 -0
  78. webscout/Provider/Gemini.py +169 -0
  79. webscout/Provider/GithubChat.py +369 -0
  80. webscout/Provider/GizAI.py +295 -0
  81. webscout/Provider/Glider.py +225 -0
  82. webscout/Provider/Groq.py +801 -0
  83. webscout/Provider/HF_space/__init__.py +0 -0
  84. webscout/Provider/HF_space/qwen_qwen2.py +206 -0
  85. webscout/Provider/HeckAI.py +375 -0
  86. webscout/Provider/HuggingFaceChat.py +469 -0
  87. webscout/Provider/Hunyuan.py +283 -0
  88. webscout/Provider/Jadve.py +291 -0
  89. webscout/Provider/Koboldai.py +384 -0
  90. webscout/Provider/LambdaChat.py +411 -0
  91. webscout/Provider/Llama3.py +259 -0
  92. webscout/Provider/MCPCore.py +315 -0
  93. webscout/Provider/Marcus.py +198 -0
  94. webscout/Provider/Nemotron.py +218 -0
  95. webscout/Provider/Netwrck.py +270 -0
  96. webscout/Provider/OLLAMA.py +396 -0
  97. webscout/Provider/OPENAI/BLACKBOXAI.py +766 -0
  98. webscout/Provider/OPENAI/Cloudflare.py +378 -0
  99. webscout/Provider/OPENAI/FreeGemini.py +283 -0
  100. webscout/Provider/OPENAI/NEMOTRON.py +232 -0
  101. webscout/Provider/OPENAI/Qwen3.py +283 -0
  102. webscout/Provider/OPENAI/README.md +952 -0
  103. webscout/Provider/OPENAI/TwoAI.py +357 -0
  104. webscout/Provider/OPENAI/__init__.py +40 -0
  105. webscout/Provider/OPENAI/ai4chat.py +293 -0
  106. webscout/Provider/OPENAI/api.py +969 -0
  107. webscout/Provider/OPENAI/base.py +249 -0
  108. webscout/Provider/OPENAI/c4ai.py +373 -0
  109. webscout/Provider/OPENAI/chatgpt.py +556 -0
  110. webscout/Provider/OPENAI/chatgptclone.py +494 -0
  111. webscout/Provider/OPENAI/chatsandbox.py +173 -0
  112. webscout/Provider/OPENAI/copilot.py +242 -0
  113. webscout/Provider/OPENAI/deepinfra.py +322 -0
  114. webscout/Provider/OPENAI/e2b.py +1414 -0
  115. webscout/Provider/OPENAI/exaai.py +417 -0
  116. webscout/Provider/OPENAI/exachat.py +444 -0
  117. webscout/Provider/OPENAI/flowith.py +162 -0
  118. webscout/Provider/OPENAI/freeaichat.py +359 -0
  119. webscout/Provider/OPENAI/glider.py +326 -0
  120. webscout/Provider/OPENAI/groq.py +364 -0
  121. webscout/Provider/OPENAI/heckai.py +308 -0
  122. webscout/Provider/OPENAI/llmchatco.py +335 -0
  123. webscout/Provider/OPENAI/mcpcore.py +389 -0
  124. webscout/Provider/OPENAI/multichat.py +376 -0
  125. webscout/Provider/OPENAI/netwrck.py +357 -0
  126. webscout/Provider/OPENAI/oivscode.py +287 -0
  127. webscout/Provider/OPENAI/opkfc.py +496 -0
  128. webscout/Provider/OPENAI/pydantic_imports.py +172 -0
  129. webscout/Provider/OPENAI/scirachat.py +477 -0
  130. webscout/Provider/OPENAI/sonus.py +304 -0
  131. webscout/Provider/OPENAI/standardinput.py +433 -0
  132. webscout/Provider/OPENAI/textpollinations.py +339 -0
  133. webscout/Provider/OPENAI/toolbaz.py +413 -0
  134. webscout/Provider/OPENAI/typefully.py +355 -0
  135. webscout/Provider/OPENAI/typegpt.py +364 -0
  136. webscout/Provider/OPENAI/uncovrAI.py +463 -0
  137. webscout/Provider/OPENAI/utils.py +318 -0
  138. webscout/Provider/OPENAI/venice.py +431 -0
  139. webscout/Provider/OPENAI/wisecat.py +387 -0
  140. webscout/Provider/OPENAI/writecream.py +163 -0
  141. webscout/Provider/OPENAI/x0gpt.py +365 -0
  142. webscout/Provider/OPENAI/yep.py +382 -0
  143. webscout/Provider/OpenGPT.py +209 -0
  144. webscout/Provider/Openai.py +496 -0
  145. webscout/Provider/PI.py +429 -0
  146. webscout/Provider/Perplexitylabs.py +415 -0
  147. webscout/Provider/QwenLM.py +254 -0
  148. webscout/Provider/Reka.py +214 -0
  149. webscout/Provider/StandardInput.py +290 -0
  150. webscout/Provider/TTI/README.md +82 -0
  151. webscout/Provider/TTI/__init__.py +7 -0
  152. webscout/Provider/TTI/aiarta.py +365 -0
  153. webscout/Provider/TTI/artbit.py +0 -0
  154. webscout/Provider/TTI/base.py +64 -0
  155. webscout/Provider/TTI/fastflux.py +200 -0
  156. webscout/Provider/TTI/magicstudio.py +201 -0
  157. webscout/Provider/TTI/piclumen.py +203 -0
  158. webscout/Provider/TTI/pixelmuse.py +225 -0
  159. webscout/Provider/TTI/pollinations.py +221 -0
  160. webscout/Provider/TTI/utils.py +11 -0
  161. webscout/Provider/TTS/README.md +192 -0
  162. webscout/Provider/TTS/__init__.py +10 -0
  163. webscout/Provider/TTS/base.py +159 -0
  164. webscout/Provider/TTS/deepgram.py +156 -0
  165. webscout/Provider/TTS/elevenlabs.py +111 -0
  166. webscout/Provider/TTS/gesserit.py +128 -0
  167. webscout/Provider/TTS/murfai.py +113 -0
  168. webscout/Provider/TTS/openai_fm.py +129 -0
  169. webscout/Provider/TTS/parler.py +111 -0
  170. webscout/Provider/TTS/speechma.py +580 -0
  171. webscout/Provider/TTS/sthir.py +94 -0
  172. webscout/Provider/TTS/streamElements.py +333 -0
  173. webscout/Provider/TTS/utils.py +280 -0
  174. webscout/Provider/TeachAnything.py +229 -0
  175. webscout/Provider/TextPollinationsAI.py +308 -0
  176. webscout/Provider/TwoAI.py +475 -0
  177. webscout/Provider/TypliAI.py +305 -0
  178. webscout/Provider/UNFINISHED/ChatHub.py +209 -0
  179. webscout/Provider/UNFINISHED/Youchat.py +330 -0
  180. webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
  181. webscout/Provider/UNFINISHED/puterjs.py +635 -0
  182. webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
  183. webscout/Provider/Venice.py +258 -0
  184. webscout/Provider/VercelAI.py +253 -0
  185. webscout/Provider/WiseCat.py +233 -0
  186. webscout/Provider/WrDoChat.py +370 -0
  187. webscout/Provider/Writecream.py +246 -0
  188. webscout/Provider/WritingMate.py +269 -0
  189. webscout/Provider/__init__.py +174 -0
  190. webscout/Provider/ai4chat.py +174 -0
  191. webscout/Provider/akashgpt.py +335 -0
  192. webscout/Provider/asksteve.py +220 -0
  193. webscout/Provider/cerebras.py +290 -0
  194. webscout/Provider/chatglm.py +215 -0
  195. webscout/Provider/cleeai.py +213 -0
  196. webscout/Provider/copilot.py +425 -0
  197. webscout/Provider/elmo.py +283 -0
  198. webscout/Provider/freeaichat.py +285 -0
  199. webscout/Provider/geminiapi.py +208 -0
  200. webscout/Provider/granite.py +235 -0
  201. webscout/Provider/hermes.py +266 -0
  202. webscout/Provider/julius.py +223 -0
  203. webscout/Provider/koala.py +170 -0
  204. webscout/Provider/learnfastai.py +325 -0
  205. webscout/Provider/llama3mitril.py +215 -0
  206. webscout/Provider/llmchat.py +258 -0
  207. webscout/Provider/llmchatco.py +306 -0
  208. webscout/Provider/lmarena.py +198 -0
  209. webscout/Provider/meta.py +801 -0
  210. webscout/Provider/multichat.py +364 -0
  211. webscout/Provider/oivscode.py +309 -0
  212. webscout/Provider/samurai.py +224 -0
  213. webscout/Provider/scira_chat.py +299 -0
  214. webscout/Provider/scnet.py +243 -0
  215. webscout/Provider/searchchat.py +292 -0
  216. webscout/Provider/sonus.py +258 -0
  217. webscout/Provider/talkai.py +194 -0
  218. webscout/Provider/toolbaz.py +353 -0
  219. webscout/Provider/turboseek.py +266 -0
  220. webscout/Provider/typefully.py +202 -0
  221. webscout/Provider/typegpt.py +289 -0
  222. webscout/Provider/uncovr.py +368 -0
  223. webscout/Provider/x0gpt.py +299 -0
  224. webscout/Provider/yep.py +389 -0
  225. webscout/__init__.py +4 -2
  226. webscout/cli.py +3 -28
  227. webscout/client.py +70 -0
  228. webscout/conversation.py +35 -35
  229. webscout/litagent/Readme.md +276 -0
  230. webscout/litagent/__init__.py +29 -0
  231. webscout/litagent/agent.py +455 -0
  232. webscout/litagent/constants.py +60 -0
  233. webscout/litprinter/__init__.py +59 -0
  234. webscout/optimizers.py +419 -419
  235. webscout/scout/README.md +404 -0
  236. webscout/scout/__init__.py +8 -0
  237. webscout/scout/core/__init__.py +7 -0
  238. webscout/scout/core/crawler.py +210 -0
  239. webscout/scout/core/scout.py +607 -0
  240. webscout/scout/core/search_result.py +96 -0
  241. webscout/scout/core/text_analyzer.py +63 -0
  242. webscout/scout/core/text_utils.py +277 -0
  243. webscout/scout/core/web_analyzer.py +52 -0
  244. webscout/scout/element.py +478 -0
  245. webscout/scout/parsers/__init__.py +69 -0
  246. webscout/scout/parsers/html5lib_parser.py +172 -0
  247. webscout/scout/parsers/html_parser.py +236 -0
  248. webscout/scout/parsers/lxml_parser.py +178 -0
  249. webscout/scout/utils.py +37 -0
  250. webscout/swiftcli/Readme.md +323 -0
  251. webscout/swiftcli/__init__.py +95 -0
  252. webscout/swiftcli/core/__init__.py +7 -0
  253. webscout/swiftcli/core/cli.py +297 -0
  254. webscout/swiftcli/core/context.py +104 -0
  255. webscout/swiftcli/core/group.py +241 -0
  256. webscout/swiftcli/decorators/__init__.py +28 -0
  257. webscout/swiftcli/decorators/command.py +221 -0
  258. webscout/swiftcli/decorators/options.py +220 -0
  259. webscout/swiftcli/decorators/output.py +252 -0
  260. webscout/swiftcli/exceptions.py +21 -0
  261. webscout/swiftcli/plugins/__init__.py +9 -0
  262. webscout/swiftcli/plugins/base.py +135 -0
  263. webscout/swiftcli/plugins/manager.py +269 -0
  264. webscout/swiftcli/utils/__init__.py +59 -0
  265. webscout/swiftcli/utils/formatting.py +252 -0
  266. webscout/swiftcli/utils/parsing.py +267 -0
  267. webscout/version.py +1 -1
  268. webscout/webscout_search.py +2 -182
  269. webscout/webscout_search_async.py +1 -179
  270. webscout/zeroart/README.md +89 -0
  271. webscout/zeroart/__init__.py +135 -0
  272. webscout/zeroart/base.py +66 -0
  273. webscout/zeroart/effects.py +101 -0
  274. webscout/zeroart/fonts.py +1239 -0
  275. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/METADATA +262 -83
  276. webscout-8.2.9.dist-info/RECORD +289 -0
  277. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
  278. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
  279. webscout-8.2.7.dist-info/RECORD +0 -26
  280. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
  281. {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
webscout/scout/README.md
@@ -0,0 +1,404 @@
+ # 🕵️ Scout: Next-Gen Web Parsing Library
+
+ <div align="center">
+
+ [![Python](https://img.shields.io/badge/Python-3.8%2B-blue)](https://www.python.org/)
+ [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
+ [![Maintenance](https://img.shields.io/badge/Maintained-Yes-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout)
+ [![Documentation](https://img.shields.io/badge/Docs-Wiki-orange)](https://github.com/OE-LUCIFER/Webscout/wiki)
+ [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout/pulls)
+
+ </div>
+
+ ## 📋 Overview
+
+ Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.
+
+ <details open>
+ <summary><b>Why Choose Scout?</b></summary>
+
+ - **Powerful Parsing**: Multiple parser backends with intelligent markup handling
+ - **Advanced Analysis**: Built-in text and web content analysis tools
+ - **Concurrent Crawling**: Efficient multi-threaded web crawling
+ - **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
+ - **Format Conversion**: Convert HTML to JSON, Markdown, and more
+
+ </details>
+
+ ## 📑 Table of Contents
+
+ - [Installation](#-installation)
+ - [Quick Start](#-quick-start)
+ - [Features](#-features)
+ - [Advanced Usage](#-advanced-usage)
+ - [API Reference](#-api-reference)
+ - [Dependencies](#-dependencies)
+ - [Supported Python Versions](#-supported-python-versions)
+ - [Contributing](#-contributing)
+ - [License](#-license)
+
+ ## 📦 Installation
+
+ ```bash
+ pip install webscout
+ ```
+
+ Or install the latest version from GitHub:
+
+ ```bash
+ pip install git+https://github.com/OE-LUCIFER/Webscout.git
+ ```
+
+ ## 🚀 Quick Start
+
+ ### Basic Parsing
+
+ ```python
+ from webscout.scout import Scout
+
+ # Parse HTML content
+ html_content = """
+ <html>
+     <body>
+         <h1>Hello, Scout!</h1>
+         <div class="content">
+             <p>Web parsing made easy.</p>
+             <a href="https://example.com">Link</a>
+         </div>
+     </body>
+ </html>
+ """
+
+ scout = Scout(html_content)
+
+ # Find elements
+ title = scout.find('h1')
+ links = scout.find_all('a')
+
+ # Extract text
+ print(title[0].get_text())  # Output: Hello, Scout!
+ print(links.attrs('href'))  # Output: ['https://example.com']
+ ```
+
+ ### Web Crawling
+
+ ```python
+ from webscout.scout import ScoutCrawler
+
+ # Crawl a website with default settings
+ crawler = ScoutCrawler('https://example.com')  # Default: max_pages=50
+
+ # Or customize the crawler
+ crawler = ScoutCrawler(
+     'https://example.com',                      # base_url
+     max_pages=100,                              # maximum pages to crawl
+     tags_to_remove=['script', 'style', 'nav']   # tags to remove from content
+ )
+
+ # Start crawling
+ crawled_pages = crawler.crawl()
+
+ for page in crawled_pages:
+     print(f"URL: {page['url']}")
+     print(f"Title: {page['title']}")
+     print(f"Links found: {len(page['links'])}")
+     print(f"Crawl depth: {page['depth']}")
+ ```
+
+ ### Text Analysis
+
+ ```python
+ from webscout.scout import Scout
+
+ # Parse a webpage
+ html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
+ <p>Visit https://climate-action.org for more information.</p></div>"""
+ scout = Scout(html)
+
+ # Analyze text and extract entities
+ analysis = scout.analyze_text()
+ print(f"Word frequencies: {analysis['word_count']}")
+ print(f"Entities found: {analysis['entities']}")
+ ```
+
+ ## ✨ Features
+
+ ### 🔍 Multiple Parser Support
+
+ Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:
+
+ | Parser | Description | Best For |
+ |--------|-------------|----------|
+ | `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
+ | `lxml` | Fast C-based parser | Performance-critical applications |
+ | `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
+ | `lxml-xml` | XML parser | XML document parsing |
+
+ ```python
+ # Choose your parser
+ scout = Scout(html_content, features='lxml')      # For speed
+ scout = Scout(html_content, features='html5lib')  # For compliance
+ ```
+
+ ### 🌐 Advanced Parsing Capabilities
+
+ Scout provides powerful tools for navigating and manipulating HTML/XML documents:
+
+ - **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
+ - **Tree Traversal**: Navigate parent-child relationships and sibling elements
+ - **Content Extraction**: Extract text, attributes, and structured data
+ - **Document Manipulation**: Modify, replace, or remove elements
+ - **Dynamic Building**: Easily append or insert new nodes
+
+ ```python
+ # CSS selector support
+ elements = scout.select('div.content > p')
+
+ # Advanced find with attribute matching
+ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
+
+ # Tree traversal
+ parent = element.find_parent('div')
+ siblings = element.find_next_siblings('p')
+ prev_sibling = element.find_previous_sibling('p')
+ ```
+
+ ### 🧠 Intelligent Analysis
+
+ Scout includes built-in analysis tools for extracting insights from web content:
+
+ #### Text Analysis
+
+ ```python
+ # Extract and analyze text
+ text = scout.get_text()
+ word_counts = scout.text_analyzer.count_words(text)
+ entities = scout.text_analyzer.extract_entities(text)
+ ```
+
+ #### Web Structure Analysis
+
+ ```python
+ # Analyze page structure
+ structure = scout.analyze_page_structure()
+ print(f"Most common tags: {structure['tag_distribution']}")
+ print(f"Page depth: {max(structure['depth_analysis'].keys())}")
+ ```
+
+ #### Semantic Information Extraction
+
+ ```python
+ # Extract semantic information
+ semantics = scout.extract_semantic_info()
+ print(f"Headings: {semantics['headings']}")
+ print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
+ print(f"Tables: {semantics['tables']['count']}")
+ ```
+
+ ### 🕸️ Web Crawling
+
+ Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:
+
+ ```python
+ from webscout.scout import ScoutCrawler
+
+ # Create a crawler with default settings
+ crawler = ScoutCrawler('https://example.com')  # Default: max_pages=50
+
+ # Or customize the crawler with specific options
+ crawler = ScoutCrawler(
+     'https://example.com',                      # base_url
+     max_pages=100,                              # maximum pages to crawl
+     tags_to_remove=['script', 'style', 'nav']   # tags to remove from content
+ )
+
+ # Start crawling
+ pages = crawler.crawl()
+
+ # Process results
+ for page in pages:
+     print(f"URL: {page['url']}")
+     print(f"Title: {page['title']}")
+     print(f"Links: {len(page['links'])}")
+     print(f"Depth: {page['depth']}")
+ ```
+
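+ The `ScoutCrawler` constructor in this release also takes a few politeness controls that the examples above do not show: `delay` (minimum seconds between requests, default `0.5`), `obey_robots` (honor robots.txt, default `True`), `allowed_domains` (restrict crawling to specific hosts), and `session` (a reusable `curl_cffi` session). See `core/crawler.py` later in this diff; a sketch:
+
+ ```python
+ from curl_cffi.requests import Session
+ from webscout.scout import ScoutCrawler
+
+ crawler = ScoutCrawler(
+     'https://example.com',
+     max_pages=20,
+     delay=1.0,                # wait at least one second between requests
+     obey_robots=True,         # skip URLs disallowed by robots.txt
+     allowed_domains=['example.com', 'www.example.com'],
+     session=Session(),        # reuse a single HTTP session for all fetches
+ )
+ ```
+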
+ The crawler automatically:
+ - Stays within the same domain as the base URL
+ - Uses concurrent requests for faster crawling
+ - Removes unwanted tags (like scripts and styles) for cleaner text extraction
+ - Tracks crawl depth for each page
+
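+ Note that `crawl()` is implemented as a generator, so pages can also be consumed as they are fetched rather than collected after the whole crawl finishes:
+
+ ```python
+ from webscout.scout import ScoutCrawler
+
+ # Each page dict is yielded as soon as it has been fetched and parsed.
+ for page in ScoutCrawler('https://example.com', max_pages=10).crawl():
+     print(f"{page['depth']}: {page['url']} ({page['title']})")
+ ```
+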
+ ### 📄 Format Conversion
+
+ Scout can convert HTML to various formats:
+
+ ```python
+ # Convert to JSON
+ json_data = scout.to_json(indent=2)
+
+ # Convert to Markdown
+ markdown = scout.to_markdown(heading_style='ATX')
+
+ # Pretty-print HTML
+ pretty_html = scout.prettify()
+ ```
+
+ ## 🔬 Advanced Usage
+
+ ### Working with Search Results
+
+ Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Find all paragraphs
+ paragraphs = scout.find_all('p')
+
+ # Extract all text from results
+ all_text = paragraphs.texts(separator='\n')
+
+ # Extract specific attributes
+ hrefs = paragraphs.attrs('href')
+
+ # Filter results with a predicate function
+ important = paragraphs.filter(lambda p: 'important' in p.get('class', []))
+
+ # Transform results
+ word_counts = paragraphs.map(lambda p: len(p.get_text().split()))
+
+ # Analyze text in results
+ analysis = paragraphs.analyze_text()
+ ```
+
+ ### URL Handling and Analysis
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Parse and analyze URLs
+ links = scout.extract_links(base_url='https://example.com')
+ for link in links:
+     url_components = scout.url_parse(link['href'])
+     print(f"Domain: {url_components['netloc']}")
+     print(f"Path: {url_components['path']}")
+ ```
+
+ ### Metadata Extraction
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Extract metadata
+ metadata = scout.extract_metadata()
+ print(f"Title: {metadata['title']}")
+ print(f"Description: {metadata['description']}")
+ print(f"Open Graph: {metadata['og_metadata']}")
+ print(f"Twitter Card: {metadata['twitter_metadata']}")
+ ```
+
+ ### Content Hashing and Caching
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Generate content hash
+ content_hash = scout.hash_content(method='sha256')
+
+ # Use caching for expensive operations
+ if not scout.cache('parsed_data'):
+     data = scout.extract_semantic_info()
+     scout.cache('parsed_data', data)
+
+ cached_data = scout.cache('parsed_data')
+ ```
+
+ ## 📚 API Reference
+
+ ### Core Classes
+
+ | Class | Description |
+ |-------|-------------|
+ | `Scout` | Main class for HTML parsing and traversal |
+ | `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
+ | `ScoutTextAnalyzer` | Text analysis utilities |
+ | `ScoutWebAnalyzer` | Web page analysis utilities |
+ | `ScoutSearchResult` | Enhanced search results with filtering and analysis |
+ | `Tag` | Represents an HTML/XML tag |
+ | `NavigableString` | Represents text within an HTML/XML document |
+
+ ### Key Methods
+
+ #### Scout Class
+
+ - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
+ - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
+ - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
+ - `select(selector)`: Find elements using CSS selector
+ - `get_text(separator=' ', strip=False)`: Extract text from document
+ - `analyze_text()`: Perform text analysis
+ - `analyze_page_structure()`: Analyze document structure
+ - `extract_semantic_info()`: Extract semantic information
+ - `extract_links(base_url=None)`: Extract all links
+ - `extract_metadata()`: Extract metadata from document
+ - `to_json(indent=2)`: Convert to JSON
+ - `to_markdown(heading_style='ATX')`: Convert to Markdown
+ - `prettify(formatter='minimal')`: Pretty-print HTML
+
+ #### ScoutCrawler Class
+
+ - `__init__(base_url, max_pages=50, tags_to_remove=None, session=None, delay=0.5, obey_robots=True, allowed_domains=None)`: Initialize the crawler
+ - `crawl()`: Start crawling from the base URL
+ - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
+ - `_is_valid_url(url)`: Check if a URL is valid (internal method)
+
+ For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).
+
+ ## 🔧 Dependencies
+
+ - `curl_cffi`: HTTP library used for web requests
+ - `lxml`: XML and HTML processing library (optional, recommended)
+ - `html5lib`: Standards-compliant HTML parser (optional)
+ - `markdownify`: HTML to Markdown conversion
+ - `concurrent.futures`: Asynchronous execution (standard library)
+
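+ The optional parsers are separate installs; assuming the standard PyPI package names:
+
+ ```bash
+ pip install lxml html5lib
+ ```
+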
+ ## 🌈 Supported Python Versions
+
+ - Python 3.8+
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Here's how you can contribute:
+
+ 1. Fork the repository
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
+ 5. Open a Pull Request
+
+ Please make sure to update tests as appropriate.
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
+
+ ---
+
+ <div align="center">
+   <p>Made with ❤️ by the Webscout team</p>
+   <p>
+     <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a> •
+     <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a> •
+     <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a> •
+     <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
+   </p>
+ </div>
webscout/scout/__init__.py
@@ -0,0 +1,8 @@
+ """
+ Scout: A powerful, zero-dependency web scraping library
+ """
+
+ from .core import Scout, ScoutCrawler, ScoutTextAnalyzer, ScoutWebAnalyzer, ScoutSearchResult
+ from .element import Tag, NavigableString
+
+ __all__ = ['Scout', 'ScoutCrawler', 'Tag', 'NavigableString', 'ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult']
webscout/scout/core/__init__.py
@@ -0,0 +1,7 @@
+ from .text_analyzer import ScoutTextAnalyzer
+ from .web_analyzer import ScoutWebAnalyzer
+ from .search_result import ScoutSearchResult
+ from .crawler import ScoutCrawler
+ from .scout import Scout
+
+ __all__ = ['ScoutTextAnalyzer', 'ScoutWebAnalyzer', 'ScoutSearchResult', 'ScoutCrawler', 'Scout']
webscout/scout/core/crawler.py
@@ -0,0 +1,210 @@
+ """
+ Scout Crawler Module
+ """
+
+ import concurrent.futures
+ import urllib.parse
+ import time
+ import hashlib
+ import re
+ from urllib import robotparser
+ from datetime import datetime
+ from typing import Dict, List, Optional, Union
+ from webscout.litagent import LitAgent
+ from curl_cffi.requests import Session
+
+ from .scout import Scout
+
+
+ class ScoutCrawler:
+     """
+     Advanced web crawling utility for Scout library.
+     """
+     def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
+         """
+         Initialize the web crawler.
+
+         Args:
+             base_url (str): Starting URL to crawl
+             max_pages (int, optional): Maximum number of pages to crawl
+             tags_to_remove (List[str], optional): List of tags to remove
+             session (Session, optional): curl_cffi session to reuse for requests
+             delay (float, optional): Minimum delay in seconds between requests
+             obey_robots (bool, optional): Whether to honor robots.txt
+             allowed_domains (List[str], optional): Domains the crawler may visit
+                 (defaults to the base URL's domain)
+         """
+         self.base_url = base_url
+         self.max_pages = max_pages
+         self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
+             "script",
+             "style",
+             "header",
+             "footer",
+             "nav",
+             "aside",
+             "form",
+             "button",
+         ]
+         self.visited_urls = set()
+         self.crawled_pages = []
+         self.session = session or Session()
+         self.agent = LitAgent()
+         # Use all headers and generate fingerprint
+         self.session.headers = self.agent.generate_fingerprint()
+         self.session.headers.setdefault("User-Agent", self.agent.chrome())
+         self.delay = delay
+         self.obey_robots = obey_robots
+         self.allowed_domains = allowed_domains or [urllib.parse.urlparse(base_url).netloc]
+         self.last_request_time = 0
+         self.url_hashes = set()
+         if obey_robots:
+             self.robots = robotparser.RobotFileParser()
+             robots_url = urllib.parse.urljoin(base_url, '/robots.txt')
+             try:
+                 self.robots.set_url(robots_url)
+                 self.robots.read()
+             except Exception:
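+                 # robots.txt could not be fetched or parsed; robots filtering is silently disabled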
+                 self.robots = None
+         else:
+             self.robots = None
+
+     def _normalize_url(self, url: str) -> str:
+         url = url.split('#')[0]
+         url = re.sub(r'\?.*$', '', url)  # Remove query params
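+         # Pages differing only by fragment or query string thus normalize to the same URL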
+         return url.rstrip('/')
+
+     def _is_valid_url(self, url: str) -> bool:
+         """
+         Check if a URL is valid and within the same domain.
+
+         Args:
+             url (str): URL to validate
+
+         Returns:
+             bool: Whether the URL is valid
+         """
+         try:
+             parsed_base = urllib.parse.urlparse(self.base_url)
+             parsed_url = urllib.parse.urlparse(url)
+             if parsed_url.scheme not in ["http", "https"]:
+                 return False
+             if parsed_url.netloc not in self.allowed_domains:
+                 return False
+             if self.obey_robots and self.robots:
+                 return self.robots.can_fetch("*", url)
+             return True
+         except Exception:
+             return False
+
+     def _is_duplicate(self, url: str) -> bool:
+         norm = self._normalize_url(url)
+         url_hash = hashlib.md5(norm.encode()).hexdigest()
+         if url_hash in self.url_hashes:
+             return True
+         self.url_hashes.add(url_hash)
+         return False
+
+     def _extract_main_text(self, soup):
+         # Try to extract main content (simple heuristic)
+         main = soup.find('main')
+         if main:
+             return main.get_text(separator=" ", strip=True)
+         article = soup.find('article')
+         if article:
+             return article.get_text(separator=" ", strip=True)
+         # fallback to body
+         body = soup.find('body')
+         if body:
+             return body.get_text(separator=" ", strip=True)
+         return soup.get_text(separator=" ", strip=True)
+
+     def _crawl_page(self, url: str, depth: int = 0) -> Dict[str, Union[str, List[str]]]:
+         """
+         Crawl a single page and extract information.
+
+         Args:
+             url (str): URL to crawl
+             depth (int, optional): Current crawl depth
+
+         Returns:
+             Dict[str, Union[str, List[str]]]: Crawled page information (empty dict if the page was skipped or failed)
+         """
+         if url in self.visited_urls or self._is_duplicate(url):
+             return {}
+         # Throttle requests
+         now = time.time()
+         if self.last_request_time:
+             elapsed = now - self.last_request_time
+             if elapsed < self.delay:
+                 time.sleep(self.delay - elapsed)
+         self.last_request_time = time.time()
+         try:
+             response = self.session.get(url, timeout=10)
+             response.raise_for_status()
+             if not response.headers.get('Content-Type', '').startswith('text/html'):
+                 return {}
+             scout = Scout(response.content, features="lxml")
+             title_result = scout.find("title")
+             title = title_result[0].get_text() if title_result else ""
+             for tag_name in self.tags_to_remove:
+                 for tag in scout._soup.find_all(tag_name):
+                     tag.extract()
+             visible_text = self._extract_main_text(scout._soup)
+             page_info = {
+                 'url': url,
+                 'title': title,
+                 'links': [
+                     urllib.parse.urljoin(url, link.get('href'))
+                     for link in scout.find_all('a', href=True)
+                     if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                 ],
+                 'text': visible_text,
+                 'depth': depth,
+                 'timestamp': datetime.utcnow().isoformat(),
+                 'headers': dict(response.headers),
+             }
+             self.visited_urls.add(url)
+             self.crawled_pages.append(page_info)
+             return page_info
+         except Exception as e:
+             print(f"Error crawling {url}: {e}")
+             return {}
+
+     def crawl(self):
+         """
+         Start web crawling from base URL and yield each crawled page in real time.
+
+         Yields:
+             Dict[str, Union[str, List[str]]]: Crawled page information
+         """
+         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+             futures = {executor.submit(self._crawl_page, self.base_url, 0)}
+             submitted_links: set[str] = set()
+
+             while futures:
+                 if len(self.visited_urls) >= self.max_pages:
+                     break
+                 done, not_done = concurrent.futures.wait(
+                     futures, return_when=concurrent.futures.FIRST_COMPLETED
+                 )
+                 futures = not_done
+
+                 for future in done:
+                     page_info = future.result()
+
+                     if page_info:
+                         yield page_info
+
+                         if len(self.visited_urls) >= self.max_pages:
+                             return
+
+                         for link in page_info.get("links", []):
+                             if (
+                                 len(self.visited_urls) < self.max_pages
+                                 and link not in self.visited_urls
+                                 and link not in submitted_links
+                             ):
+                                 submitted_links.add(link)
+                                 futures.add(
+                                     executor.submit(
+                                         self._crawl_page,
+                                         link,
+                                         page_info.get("depth", 0) + 1,
+                                     )
+                                 )