webscout 8.2.6__py3-none-any.whl → 8.2.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (150) hide show
  1. webscout/AIauto.py +1 -1
  2. webscout/AIutel.py +298 -239
  3. webscout/Extra/Act.md +309 -0
  4. webscout/Extra/GitToolkit/gitapi/README.md +110 -0
  5. webscout/Extra/YTToolkit/README.md +375 -0
  6. webscout/Extra/YTToolkit/ytapi/README.md +44 -0
  7. webscout/Extra/YTToolkit/ytapi/extras.py +92 -19
  8. webscout/Extra/autocoder/autocoder.py +309 -114
  9. webscout/Extra/autocoder/autocoder_utiles.py +15 -15
  10. webscout/Extra/gguf.md +430 -0
  11. webscout/Extra/tempmail/README.md +488 -0
  12. webscout/Extra/weather.md +281 -0
  13. webscout/Litlogger/Readme.md +175 -0
  14. webscout/Provider/AISEARCH/DeepFind.py +41 -37
  15. webscout/Provider/AISEARCH/README.md +279 -0
  16. webscout/Provider/AISEARCH/__init__.py +0 -1
  17. webscout/Provider/AISEARCH/genspark_search.py +228 -86
  18. webscout/Provider/AISEARCH/hika_search.py +11 -11
  19. webscout/Provider/AISEARCH/scira_search.py +324 -322
  20. webscout/Provider/AllenAI.py +7 -14
  21. webscout/Provider/Blackboxai.py +518 -74
  22. webscout/Provider/Cloudflare.py +0 -1
  23. webscout/Provider/Deepinfra.py +23 -21
  24. webscout/Provider/Flowith.py +217 -0
  25. webscout/Provider/FreeGemini.py +250 -0
  26. webscout/Provider/GizAI.py +15 -5
  27. webscout/Provider/Glider.py +11 -8
  28. webscout/Provider/HeckAI.py +80 -52
  29. webscout/Provider/Koboldai.py +7 -4
  30. webscout/Provider/LambdaChat.py +2 -2
  31. webscout/Provider/Marcus.py +10 -18
  32. webscout/Provider/OPENAI/BLACKBOXAI.py +735 -0
  33. webscout/Provider/OPENAI/Cloudflare.py +378 -0
  34. webscout/Provider/OPENAI/FreeGemini.py +282 -0
  35. webscout/Provider/OPENAI/NEMOTRON.py +244 -0
  36. webscout/Provider/OPENAI/README.md +1253 -0
  37. webscout/Provider/OPENAI/__init__.py +8 -0
  38. webscout/Provider/OPENAI/ai4chat.py +293 -286
  39. webscout/Provider/OPENAI/api.py +810 -0
  40. webscout/Provider/OPENAI/base.py +217 -14
  41. webscout/Provider/OPENAI/c4ai.py +373 -367
  42. webscout/Provider/OPENAI/chatgpt.py +7 -0
  43. webscout/Provider/OPENAI/chatgptclone.py +7 -0
  44. webscout/Provider/OPENAI/chatsandbox.py +172 -0
  45. webscout/Provider/OPENAI/deepinfra.py +30 -20
  46. webscout/Provider/OPENAI/e2b.py +6 -0
  47. webscout/Provider/OPENAI/exaai.py +7 -0
  48. webscout/Provider/OPENAI/exachat.py +6 -0
  49. webscout/Provider/OPENAI/flowith.py +162 -0
  50. webscout/Provider/OPENAI/freeaichat.py +359 -352
  51. webscout/Provider/OPENAI/glider.py +323 -316
  52. webscout/Provider/OPENAI/groq.py +361 -354
  53. webscout/Provider/OPENAI/heckai.py +30 -64
  54. webscout/Provider/OPENAI/llmchatco.py +8 -0
  55. webscout/Provider/OPENAI/mcpcore.py +7 -0
  56. webscout/Provider/OPENAI/multichat.py +8 -0
  57. webscout/Provider/OPENAI/netwrck.py +356 -350
  58. webscout/Provider/OPENAI/opkfc.py +8 -0
  59. webscout/Provider/OPENAI/scirachat.py +471 -462
  60. webscout/Provider/OPENAI/sonus.py +9 -0
  61. webscout/Provider/OPENAI/standardinput.py +9 -1
  62. webscout/Provider/OPENAI/textpollinations.py +339 -329
  63. webscout/Provider/OPENAI/toolbaz.py +7 -0
  64. webscout/Provider/OPENAI/typefully.py +355 -0
  65. webscout/Provider/OPENAI/typegpt.py +358 -346
  66. webscout/Provider/OPENAI/uncovrAI.py +7 -0
  67. webscout/Provider/OPENAI/utils.py +103 -7
  68. webscout/Provider/OPENAI/venice.py +12 -0
  69. webscout/Provider/OPENAI/wisecat.py +19 -19
  70. webscout/Provider/OPENAI/writecream.py +7 -0
  71. webscout/Provider/OPENAI/x0gpt.py +7 -0
  72. webscout/Provider/OPENAI/yep.py +50 -21
  73. webscout/Provider/OpenGPT.py +1 -1
  74. webscout/Provider/TTI/AiForce/README.md +159 -0
  75. webscout/Provider/TTI/FreeAIPlayground/README.md +99 -0
  76. webscout/Provider/TTI/ImgSys/README.md +174 -0
  77. webscout/Provider/TTI/MagicStudio/README.md +101 -0
  78. webscout/Provider/TTI/Nexra/README.md +155 -0
  79. webscout/Provider/TTI/PollinationsAI/README.md +146 -0
  80. webscout/Provider/TTI/README.md +128 -0
  81. webscout/Provider/TTI/aiarta/README.md +134 -0
  82. webscout/Provider/TTI/artbit/README.md +100 -0
  83. webscout/Provider/TTI/fastflux/README.md +129 -0
  84. webscout/Provider/TTI/huggingface/README.md +114 -0
  85. webscout/Provider/TTI/piclumen/README.md +161 -0
  86. webscout/Provider/TTI/pixelmuse/README.md +79 -0
  87. webscout/Provider/TTI/talkai/README.md +139 -0
  88. webscout/Provider/TTS/README.md +192 -0
  89. webscout/Provider/TTS/__init__.py +2 -1
  90. webscout/Provider/TTS/speechma.py +500 -100
  91. webscout/Provider/TTS/sthir.py +94 -0
  92. webscout/Provider/TeachAnything.py +3 -7
  93. webscout/Provider/TextPollinationsAI.py +4 -2
  94. webscout/Provider/{aimathgpt.py → UNFINISHED/ChatHub.py} +88 -68
  95. webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
  96. webscout/Provider/UNFINISHED/oivscode.py +351 -0
  97. webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
  98. webscout/Provider/Writecream.py +11 -2
  99. webscout/Provider/__init__.py +8 -14
  100. webscout/Provider/ai4chat.py +4 -58
  101. webscout/Provider/asksteve.py +17 -9
  102. webscout/Provider/cerebras.py +3 -1
  103. webscout/Provider/koala.py +170 -268
  104. webscout/Provider/llmchat.py +3 -0
  105. webscout/Provider/lmarena.py +198 -0
  106. webscout/Provider/meta.py +7 -4
  107. webscout/Provider/samurai.py +223 -0
  108. webscout/Provider/scira_chat.py +4 -2
  109. webscout/Provider/typefully.py +23 -151
  110. webscout/__init__.py +4 -2
  111. webscout/cli.py +3 -28
  112. webscout/conversation.py +35 -35
  113. webscout/litagent/Readme.md +276 -0
  114. webscout/scout/README.md +402 -0
  115. webscout/swiftcli/Readme.md +323 -0
  116. webscout/version.py +1 -1
  117. webscout/webscout_search.py +2 -182
  118. webscout/webscout_search_async.py +1 -179
  119. webscout/zeroart/README.md +89 -0
  120. webscout/zeroart/__init__.py +134 -54
  121. webscout/zeroart/base.py +19 -13
  122. webscout/zeroart/effects.py +101 -99
  123. webscout/zeroart/fonts.py +1239 -816
  124. {webscout-8.2.6.dist-info → webscout-8.2.8.dist-info}/METADATA +116 -74
  125. {webscout-8.2.6.dist-info → webscout-8.2.8.dist-info}/RECORD +130 -103
  126. {webscout-8.2.6.dist-info → webscout-8.2.8.dist-info}/WHEEL +1 -1
  127. webscout-8.2.8.dist-info/entry_points.txt +3 -0
  128. webscout-8.2.8.dist-info/top_level.txt +1 -0
  129. webscout/Provider/AISEARCH/ISou.py +0 -256
  130. webscout/Provider/ElectronHub.py +0 -773
  131. webscout/Provider/Free2GPT.py +0 -241
  132. webscout/Provider/GPTWeb.py +0 -249
  133. webscout/Provider/bagoodex.py +0 -145
  134. webscout/Provider/geminiprorealtime.py +0 -160
  135. webscout/scout/core.py +0 -881
  136. webscout-8.2.6.dist-info/entry_points.txt +0 -3
  137. webscout-8.2.6.dist-info/top_level.txt +0 -2
  138. webstoken/__init__.py +0 -30
  139. webstoken/classifier.py +0 -189
  140. webstoken/keywords.py +0 -216
  141. webstoken/language.py +0 -128
  142. webstoken/ner.py +0 -164
  143. webstoken/normalizer.py +0 -35
  144. webstoken/processor.py +0 -77
  145. webstoken/sentiment.py +0 -206
  146. webstoken/stemmer.py +0 -73
  147. webstoken/tagger.py +0 -60
  148. webstoken/tokenizer.py +0 -158
  149. /webscout/Provider/{Youchat.py → UNFINISHED/Youchat.py} +0 -0
  150. {webscout-8.2.6.dist-info → webscout-8.2.8.dist-info}/licenses/LICENSE.md +0 -0
@@ -0,0 +1,276 @@
1
+ # 🔥 LitAgent - The Lit User Agent Generator
2
+
3
+ LitAgent is a powerful and modern user agent generator that keeps your requests fresh and undetectable! Built with style and packed with features, it's your go-to solution for managing user agents in your web scraping projects.
4
+
5
+ ## 🚀 Quick Start
6
+
7
+ ```python
8
+ from webscout import LitAgent
9
+
10
+ # Create a LitAgent instance
11
+ agent = LitAgent()
12
+
13
+ # Get a random user agent
14
+ ua = agent.random()
15
+ print(ua)  # Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ...
16
+ ```
17
+
18
+ ## 🎯 Features
19
+
20
+ ### Browser-Specific Agents
21
+
22
+ ```python
23
+ # Get agents for specific browsers
24
+ chrome_ua = agent.chrome() # Latest Chrome agent
25
+ firefox_ua = agent.firefox() # Latest Firefox agent
26
+ safari_ua = agent.safari() # Latest Safari agent
27
+ edge_ua = agent.edge() # Latest Edge agent
28
+ opera_ua = agent.opera() # Latest Opera agent
29
+ ```
30
+
31
+ ### Device-Specific Agents
32
+
33
+ ```python
34
+ # Get mobile or desktop agents
35
+ mobile_ua = agent.mobile() # Mobile device agent
36
+ desktop_ua = agent.desktop() # Desktop device agent
37
+
38
+ # New - Get agents for specific device types
39
+ tablet_ua = agent.tablet() # Tablet device agent
40
+ tv_ua = agent.smart_tv() # Smart TV agent
41
+ console_ua = agent.gaming() # Gaming console agent
42
+ ```
43
+
44
+ ### OS-Specific Agents
45
+
46
+ ```python
47
+ # New - Get agents for specific operating systems
48
+ windows_ua = agent.windows() # Windows agent
49
+ mac_ua = agent.macos() # macOS agent
50
+ linux_ua = agent.linux() # Linux agent
51
+ android_ua = agent.android() # Android agent
52
+ ios_ua = agent.ios() # iOS agent
53
+ ```
54
+
55
+ ### Custom Agent Generation
56
+
57
+ ```python
58
+ # New - Generate custom user agents with specific attributes
59
+ custom_ua = agent.custom(
60
+ browser="chrome",
61
+ version="91.0",
62
+ os="windows",
63
+ os_version="10",
64
+ device_type="desktop"
65
+ )
66
+ ```
67
+
68
+ ### Keep It Fresh
69
+
70
+ ```python
71
+ # Refresh your agents pool anytime
72
+ agent.refresh() # Generates new set of agents
73
+
74
+ # New - Schedule automatic refreshes
75
+ agent.auto_refresh(interval_minutes=30) # Auto-refresh every 30 minutes
76
+ ```
77
+
78
+ ## 💫 Real-World Examples
79
+
80
+ ### With Requests
81
+
82
+ ```python
83
+ import requests
84
+ from webscout import LitAgent
85
+
86
+ agent = LitAgent()
87
+
88
+ def make_request(url):
89
+ headers = {
90
+ 'User-Agent': agent.random(),
91
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
92
+ 'Accept-Language': 'en-US,en;q=0.5',
93
+ 'Connection': 'keep-alive',
94
+ }
95
+ return requests.get(url, headers=headers)
96
+
97
+ # Make requests with different agents
98
+ response1 = make_request('https://api.example.com') # Random agent
99
+ response2 = make_request('https://mobile.example.com') # Another random agent
100
+ ```
101
+
102
+ ### With aiohttp
103
+
104
+ ```python
105
+ import aiohttp
106
+ import asyncio
107
+ from webscout import LitAgent
108
+
109
+ agent = LitAgent()
110
+
111
+ async def fetch(url):
112
+ headers = {'User-Agent': agent.random()}
113
+ async with aiohttp.ClientSession() as session:
114
+ async with session.get(url, headers=headers) as response:
115
+ return await response.text()
116
+
117
+ # Use it in your async code
118
+ async def main():
119
+ urls = [
120
+ 'https://api1.example.com',
121
+ 'https://api2.example.com',
122
+ 'https://api3.example.com'
123
+ ]
124
+ tasks = [fetch(url) for url in urls]
125
+ results = await asyncio.gather(*tasks)
126
+ return results
127
+ ```
128
+
129
+ ### With Selenium
130
+
131
+ ```python
132
+ from selenium import webdriver
133
+ from webscout import LitAgent
134
+
135
+ agent = LitAgent()
136
+
137
+ def create_driver():
138
+ options = webdriver.ChromeOptions()
139
+ options.add_argument(f'user-agent={agent.chrome()}')
140
+ return webdriver.Chrome(options=options)
141
+
142
+ # Use it with Selenium
143
+ driver = create_driver()
144
+ driver.get('https://example.com')
145
+ ```
146
+
147
+ ### New - With Playwright
148
+
149
+ ```python
150
+ from playwright.sync_api import sync_playwright
151
+ from webscout import LitAgent
152
+
153
+ agent = LitAgent()
154
+
155
+ def browse_with_playwright():
156
+ with sync_playwright() as p:
157
+ browser_options = {
158
+ "user_agent": agent.chrome(),
159
+ "viewport": {"width": 1280, "height": 720}
160
+ }
161
+ browser = p.chromium.launch()
162
+ context = browser.new_context(**browser_options)
163
+ page = context.new_page()
164
+ page.goto('https://example.com')
165
+ # Continue with your scraping logic
166
+ browser.close()
167
+ ```
168
+
169
+ ## 🌟 Pro Tips
170
+
171
+ 1. **Rotate Agents**: Refresh your agents pool periodically to avoid detection
172
+ ```python
173
+ agent = LitAgent()
174
+     for i in range(10):
175
+         response = requests.get(url, headers={'User-Agent': agent.random()})
176
+         if i % 3 == 0:  # Refresh on every third request (including the first)
177
+             agent.refresh()
178
+ ```
179
+
180
+ 2. **Device-Specific Scraping**: Use appropriate agents for different platforms
181
+ ```python
182
+ # Mobile site scraping
183
+ mobile_response = requests.get(
184
+ 'https://m.example.com',
185
+ headers={'User-Agent': agent.mobile()}
186
+ )
187
+
188
+ # Desktop site scraping
189
+ desktop_response = requests.get(
190
+ 'https://example.com',
191
+ headers={'User-Agent': agent.desktop()}
192
+ )
193
+ ```
194
+
195
+ 3. **Browser Consistency**: Stick to one browser type per session
196
+ ```python
197
+ chrome_agent = agent.chrome()
198
+ headers = {
199
+ 'User-Agent': chrome_agent,
200
+ 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
201
+ }
202
+ # Use these headers for all requests in this session
203
+ ```
204
+
205
+ 4. **New - Browser Fingerprinting Defense**:
206
+ ```python
207
+ # Create consistent browser fingerprinting
208
+ fingerprint = agent.generate_fingerprint(browser="chrome")
209
+
210
+ headers = {
211
+ 'User-Agent': fingerprint['user_agent'],
212
+ 'Accept-Language': fingerprint['accept_language'],
213
+ 'Accept': fingerprint['accept'],
214
+ 'Sec-Ch-Ua': fingerprint['sec_ch_ua'],
215
+ 'Sec-Ch-Ua-Platform': fingerprint['platform']
216
+ }
217
+
218
+ # Use this consistent set for all session requests
219
+ ```
220
+
221
+ 5. **New - Multi-threading Support**:
222
+ ```python
223
+ import concurrent.futures
224
+ from webscout import LitAgent
225
+
226
+ agent = LitAgent(thread_safe=True) # Thread-safe instance
227
+
228
+ def fetch_url(url):
229
+ headers = {'User-Agent': agent.random()}
230
+ return requests.get(url, headers=headers).text
231
+
232
+ urls = ['https://example1.com', 'https://example2.com', 'https://example3.com']
233
+
234
+ with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
235
+ results = list(executor.map(fetch_url, urls))
236
+ ```
237
+
238
+ ## 🔧 Supported Browsers & Devices
239
+
240
+ - **Browsers**: Chrome, Firefox, Safari, Edge, Opera, Brave, Vivaldi
241
+ - **Operating Systems**: Windows, macOS, Linux, Android, iOS, Chrome OS
242
+ - **Devices**: Mobile phones, Tablets, Desktops, Game consoles, Smart TVs, Wearables
243
+
244
+ ## 🎨 Why LitAgent?
245
+
246
+ - 🚀 Modern and up-to-date user agents
247
+ - 💪 Easy to use, hard to detect
248
+ - 🔄 Fresh agents on demand
249
+ - 📱 Device-specific agents
250
+ - 🌐 All major browsers supported
251
+ - ⚡ Lightweight and fast
252
+ - 🧩 Advanced fingerprinting protection
253
+ - 🔄 Seamless proxy integration
254
+ - 🧵 Thread-safe operation
255
+ - 🕰️ Automatic refresh scheduling
256
+
257
+ ## 📊 New - Analytics and Reporting
258
+
259
+ ```python
260
+ # Get statistics on your agent usage
261
+ stats = agent.get_stats()
262
+ print(f"Agents generated: {stats.total_generated}")
263
+ print(f"Most used browser: {stats.top_browser}")
264
+ print(f"Detection avoidance rate: {stats.avoidance_rate}%")
265
+
266
+ # Export your usage data
267
+ agent.export_stats('agent_usage.json')
268
+ ```
269
+
270
+ ## 📋 Installation
271
+
272
+ ```bash
273
+ pip install webscout
274
+ ```
275
+
276
+ Made with 💖 by the HelpingAI team
@@ -0,0 +1,402 @@
1
+ # 🕵️ Scout: Next-Gen Web Parsing Library
2
+
3
+ <div align="center">
4
+
5
+ [![Python](https://img.shields.io/badge/Python-3.8%2B-blue)](https://www.python.org/)
6
+ [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
7
+ [![Maintenance](https://img.shields.io/badge/Maintained-Yes-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout)
8
+ [![Documentation](https://img.shields.io/badge/Docs-Wiki-orange)](https://github.com/OE-LUCIFER/Webscout/wiki)
9
+ [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout/pulls)
10
+
11
+ </div>
12
+
13
+ ## 📋 Overview
14
+
15
+ Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.
16
+
17
+ <details open>
18
+ <summary><b>Why Choose Scout?</b></summary>
19
+
20
+ - **Powerful Parsing**: Multiple parser backends with intelligent markup handling
21
+ - **Advanced Analysis**: Built-in text and web content analysis tools
22
+ - **Concurrent Crawling**: Efficient multi-threaded web crawling
23
+ - **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
24
+ - **Format Conversion**: Convert HTML to JSON, Markdown, and more
25
+
26
+ </details>
27
+
28
+ ## 📑 Table of Contents
29
+
30
+ - [Installation](#-installation)
31
+ - [Quick Start](#-quick-start)
32
+ - [Features](#-features)
33
+ - [Advanced Usage](#-advanced-usage)
34
+ - [API Reference](#-api-reference)
35
+ - [Dependencies](#-dependencies)
36
+ - [Supported Python Versions](#-supported-python-versions)
37
+ - [Contributing](#-contributing)
38
+ - [License](#-license)
39
+
40
+ ## 📦 Installation
41
+
42
+ ```bash
43
+ pip install webscout
44
+ ```
45
+
46
+ Or install the latest version from GitHub:
47
+
48
+ ```bash
49
+ pip install git+https://github.com/OE-LUCIFER/Webscout.git
50
+ ```
51
+
52
+ ## 🚀 Quick Start
53
+
54
+ ### Basic Parsing
55
+
56
+ ```python
57
+ from webscout.scout import Scout
58
+
59
+ # Parse HTML content
60
+ html_content = """
61
+ <html>
62
+ <body>
63
+ <h1>Hello, Scout!</h1>
64
+ <div class="content">
65
+ <p>Web parsing made easy.</p>
66
+ <a href="https://example.com">Link</a>
67
+ </div>
68
+ </body>
69
+ </html>
70
+ """
71
+
72
+ scout = Scout(html_content)
73
+
74
+ # Find elements
75
+ title = scout.find('h1')
76
+ links = scout.find_all('a')
77
+
78
+ # Extract text
79
+ print(title[0].get_text()) # Output: Hello, Scout!
80
+ print(links.attrs('href')) # Output: ['https://example.com']
81
+ ```
82
+
83
+ ### Web Crawling
84
+
85
+ ```python
86
+ from webscout.scout import ScoutCrawler
87
+
88
+ # Crawl a website with default settings
89
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
90
+
91
+ # Or customize the crawler
92
+ crawler = ScoutCrawler(
93
+ 'https://example.com', # base_url
94
+ max_pages=100, # maximum pages to crawl
95
+ tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
96
+ )
97
+
98
+ # Start crawling
99
+ crawled_pages = crawler.crawl()
100
+
101
+ for page in crawled_pages:
102
+ print(f"URL: {page['url']}")
103
+ print(f"Title: {page['title']}")
104
+ print(f"Links found: {len(page['links'])}")
105
+ print(f"Crawl depth: {page['depth']}")
106
+ ```
107
+
108
+ ### Text Analysis
109
+
110
+ ```python
111
+ from webscout.scout import Scout
112
+
113
+ # Parse a webpage
114
+ html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
115
+ <p>Visit https://climate-action.org for more information.</p></div>"""
116
+ scout = Scout(html)
117
+
118
+ # Analyze text and extract entities
119
+ analysis = scout.analyze_text()
120
+ print(f"Word frequencies: {analysis['word_count']}")
121
+ print(f"Entities found: {analysis['entities']}")
122
+ ```
123
+
124
+ ## ✨ Features
125
+
126
+ ### 🔍 Multiple Parser Support
127
+
128
+ Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:
129
+
130
+ | Parser | Description | Best For |
131
+ |--------|-------------|----------|
132
+ | `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
133
+ | `lxml` | Fast C-based parser | Performance-critical applications |
134
+ | `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
135
+ | `lxml-xml` | XML parser | XML document parsing |
136
+
137
+ ```python
138
+ # Choose your parser
139
+ scout = Scout(html_content, features='lxml') # For speed
140
+ scout = Scout(html_content, features='html5lib') # For compliance
141
+ ```
142
+
143
+ ### 🌐 Advanced Parsing Capabilities
144
+
145
+ Scout provides powerful tools for navigating and manipulating HTML/XML documents:
146
+
147
+ - **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
148
+ - **Tree Traversal**: Navigate parent-child relationships and sibling elements
149
+ - **Content Extraction**: Extract text, attributes, and structured data
150
+ - **Document Manipulation**: Modify, replace, or remove elements
151
+
152
+ ```python
153
+ # CSS selector support
154
+ elements = scout.select('div.content > p')
155
+
156
+ # Advanced find with attribute matching
157
+ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
158
+
159
+ # Tree traversal
160
+ parent = element.find_parent('div')
161
+ siblings = element.find_next_siblings('p')
162
+ ```
163
+
164
+ ### 🧠 Intelligent Analysis
165
+
166
+ Scout includes built-in analysis tools for extracting insights from web content:
167
+
168
+ #### Text Analysis
169
+
170
+ ```python
171
+ # Extract and analyze text
172
+ text = scout.get_text()
173
+ word_counts = scout.text_analyzer.count_words(text)
174
+ entities = scout.text_analyzer.extract_entities(text)
175
+ ```
176
+
177
+ #### Web Structure Analysis
178
+
179
+ ```python
180
+ # Analyze page structure
181
+ structure = scout.analyze_page_structure()
182
+ print(f"Most common tags: {structure['tag_distribution']}")
183
+ print(f"Page depth: {max(structure['depth_analysis'].keys())}")
184
+ ```
185
+
186
+ #### Semantic Information Extraction
187
+
188
+ ```python
189
+ # Extract semantic information
190
+ semantics = scout.extract_semantic_info()
191
+ print(f"Headings: {semantics['headings']}")
192
+ print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
193
+ print(f"Tables: {semantics['tables']['count']}")
194
+ ```
195
+
196
+ ### 🕸️ Web Crawling
197
+
198
+ Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:
199
+
200
+ ```python
201
+ from webscout.scout import ScoutCrawler
202
+
203
+ # Create a crawler with default settings
204
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
205
+
206
+ # Or customize the crawler with specific options
207
+ crawler = ScoutCrawler(
208
+ 'https://example.com', # base_url
209
+ max_pages=100, # maximum pages to crawl
210
+ tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
211
+ )
212
+
213
+ # Start crawling
214
+ pages = crawler.crawl()
215
+
216
+ # Process results
217
+ for page in pages:
218
+ print(f"URL: {page['url']}")
219
+ print(f"Title: {page['title']}")
220
+ print(f"Links: {len(page['links'])}")
221
+ print(f"Depth: {page['depth']}")
222
+ ```
223
+
224
+ The crawler automatically:
225
+ - Stays within the same domain as the base URL
226
+ - Uses concurrent requests for faster crawling
227
+ - Removes unwanted tags (like scripts and styles) for cleaner text extraction
228
+ - Tracks crawl depth for each page
229
+
230
+ ### 📄 Format Conversion
231
+
232
+ Scout can convert HTML to various formats:
233
+
234
+ ```python
235
+ # Convert to JSON
236
+ json_data = scout.to_json(indent=2)
237
+
238
+ # Convert to Markdown
239
+ markdown = scout.to_markdown(heading_style='ATX')
240
+
241
+ # Pretty-print HTML
242
+ pretty_html = scout.prettify()
243
+ ```
244
+
245
+ ## 🔬 Advanced Usage
246
+
247
+ ### Working with Search Results
248
+
249
+ Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:
250
+
251
+ ```python
252
+ from webscout.scout import Scout
253
+
254
+ scout = Scout(html_content)
255
+
256
+ # Find all paragraphs
257
+ paragraphs = scout.find_all('p')
258
+
259
+ # Extract all text from results
260
+ all_text = paragraphs.texts(separator='\n')
261
+
262
+ # Extract specific attributes
263
+ hrefs = paragraphs.attrs('href')
264
+
265
+ # Filter results with a predicate function
266
+ important = paragraphs.filter(lambda p: 'important' in p.get('class', []))
267
+
268
+ # Transform results
269
+ word_counts = paragraphs.map(lambda p: len(p.get_text().split()))
270
+
271
+ # Analyze text in results
272
+ analysis = paragraphs.analyze_text()
273
+ ```
274
+
275
+ ### URL Handling and Analysis
276
+
277
+ ```python
278
+ from webscout.scout import Scout
279
+
280
+ scout = Scout(html_content)
281
+
282
+ # Parse and analyze URLs
283
+ links = scout.extract_links(base_url='https://example.com')
284
+ for link in links:
285
+ url_components = scout.url_parse(link['href'])
286
+ print(f"Domain: {url_components['netloc']}")
287
+ print(f"Path: {url_components['path']}")
288
+ ```
289
+
290
+ ### Metadata Extraction
291
+
292
+ ```python
293
+ from webscout.scout import Scout
294
+
295
+ scout = Scout(html_content)
296
+
297
+ # Extract metadata
298
+ metadata = scout.extract_metadata()
299
+ print(f"Title: {metadata['title']}")
300
+ print(f"Description: {metadata['description']}")
301
+ print(f"Open Graph: {metadata['og_metadata']}")
302
+ print(f"Twitter Card: {metadata['twitter_metadata']}")
303
+ ```
304
+
305
+ ### Content Hashing and Caching
306
+
307
+ ```python
308
+ from webscout.scout import Scout
309
+
310
+ scout = Scout(html_content)
311
+
312
+ # Generate content hash
313
+ content_hash = scout.hash_content(method='sha256')
314
+
315
+ # Use caching for expensive operations
316
+ if not scout.cache('parsed_data'):
317
+ data = scout.extract_semantic_info()
318
+ scout.cache('parsed_data', data)
319
+
320
+ cached_data = scout.cache('parsed_data')
321
+ ```
322
+
323
+ ## 📚 API Reference
324
+
325
+ ### Core Classes
326
+
327
+ | Class | Description |
328
+ |-------|-------------|
329
+ | `Scout` | Main class for HTML parsing and traversal |
330
+ | `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
331
+ | `ScoutTextAnalyzer` | Text analysis utilities |
332
+ | `ScoutWebAnalyzer` | Web page analysis utilities |
333
+ | `ScoutSearchResult` | Enhanced search results with filtering and analysis |
334
+ | `Tag` | Represents an HTML/XML tag |
335
+ | `NavigableString` | Represents text within an HTML/XML document |
336
+
337
+ ### Key Methods
338
+
339
+ #### Scout Class
340
+
341
+ - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
342
+ - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
343
+ - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
344
+ - `select(selector)`: Find elements using CSS selector
345
+ - `get_text(separator=' ', strip=False)`: Extract text from document
346
+ - `analyze_text()`: Perform text analysis
347
+ - `analyze_page_structure()`: Analyze document structure
348
+ - `extract_semantic_info()`: Extract semantic information
349
+ - `extract_links(base_url=None)`: Extract all links
350
+ - `extract_metadata()`: Extract metadata from document
351
+ - `to_json(indent=2)`: Convert to JSON
352
+ - `to_markdown(heading_style='ATX')`: Convert to Markdown
353
+ - `prettify(formatter='minimal')`: Pretty-print HTML
354
+
355
+ #### ScoutCrawler Class
356
+
357
+ - `__init__(base_url, max_pages=50, tags_to_remove=None)`: Initialize the crawler
358
+ - `crawl()`: Start crawling from the base URL
359
+ - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
360
+ - `_is_valid_url(url)`: Check if a URL is valid (internal method)
361
+
362
+ For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).
363
+
364
+ ## 🔧 Dependencies
365
+
366
+ - `requests`: HTTP library for making web requests
367
+ - `lxml`: XML and HTML processing library (optional, recommended)
368
+ - `html5lib`: Standards-compliant HTML parser (optional)
369
+ - `markdownify`: HTML to Markdown conversion
370
+ - `concurrent.futures`: Asynchronous execution (standard library)
371
+
372
+ ## 🌈 Supported Python Versions
373
+
374
+ - Python 3.8+
375
+
376
+ ## 🤝 Contributing
377
+
378
+ Contributions are welcome! Here's how you can contribute:
379
+
380
+ 1. Fork the repository
381
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
382
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
383
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
384
+ 5. Open a Pull Request
385
+
386
+ Please make sure to update tests as appropriate.
387
+
388
+ ## 📄 License
389
+
390
+ This project is licensed under the MIT License - see the LICENSE file for details.
391
+
392
+ ---
393
+
394
+ <div align="center">
395
+ <p>Made with ❤️ by the Webscout team</p>
396
+ <p>
397
+ <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a> •
398
+ <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a> •
399
+ <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a> •
400
+ <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
401
+ </p>
402
+ </div>