webscout 8.3.4-py3-none-any.whl → 8.3.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of webscout might be problematic; see the registry listing for more details.
- webscout/AIutel.py +52 -1016
- webscout/Bard.py +12 -6
- webscout/DWEBS.py +66 -57
- webscout/Provider/AISEARCH/PERPLEXED_search.py +214 -0
- webscout/Provider/AISEARCH/__init__.py +11 -10
- webscout/Provider/AISEARCH/felo_search.py +7 -3
- webscout/Provider/AISEARCH/scira_search.py +2 -0
- webscout/Provider/AISEARCH/stellar_search.py +53 -8
- webscout/Provider/Deepinfra.py +13 -1
- webscout/Provider/Flowith.py +6 -1
- webscout/Provider/GithubChat.py +1 -0
- webscout/Provider/GptOss.py +207 -0
- webscout/Provider/Kimi.py +445 -0
- webscout/Provider/Netwrck.py +3 -6
- webscout/Provider/OPENAI/README.md +2 -1
- webscout/Provider/OPENAI/TogetherAI.py +12 -8
- webscout/Provider/OPENAI/TwoAI.py +94 -1
- webscout/Provider/OPENAI/__init__.py +4 -4
- webscout/Provider/OPENAI/copilot.py +20 -4
- webscout/Provider/OPENAI/deepinfra.py +12 -0
- webscout/Provider/OPENAI/e2b.py +60 -8
- webscout/Provider/OPENAI/flowith.py +4 -3
- webscout/Provider/OPENAI/generate_api_key.py +48 -0
- webscout/Provider/OPENAI/gptoss.py +288 -0
- webscout/Provider/OPENAI/kimi.py +469 -0
- webscout/Provider/OPENAI/netwrck.py +8 -12
- webscout/Provider/OPENAI/refact.py +274 -0
- webscout/Provider/OPENAI/scirachat.py +4 -0
- webscout/Provider/OPENAI/textpollinations.py +11 -10
- webscout/Provider/OPENAI/toolbaz.py +1 -0
- webscout/Provider/OPENAI/venice.py +1 -0
- webscout/Provider/Perplexitylabs.py +163 -147
- webscout/Provider/Qodo.py +30 -6
- webscout/Provider/TTI/__init__.py +1 -0
- webscout/Provider/TTI/bing.py +14 -2
- webscout/Provider/TTI/together.py +11 -9
- webscout/Provider/TTI/venice.py +368 -0
- webscout/Provider/TTS/README.md +0 -1
- webscout/Provider/TTS/__init__.py +0 -1
- webscout/Provider/TTS/base.py +479 -159
- webscout/Provider/TTS/deepgram.py +409 -156
- webscout/Provider/TTS/elevenlabs.py +425 -111
- webscout/Provider/TTS/freetts.py +317 -140
- webscout/Provider/TTS/gesserit.py +192 -128
- webscout/Provider/TTS/murfai.py +248 -113
- webscout/Provider/TTS/openai_fm.py +347 -129
- webscout/Provider/TTS/speechma.py +620 -586
- webscout/Provider/TextPollinationsAI.py +11 -10
- webscout/Provider/TogetherAI.py +12 -4
- webscout/Provider/TwoAI.py +96 -2
- webscout/Provider/TypliAI.py +33 -27
- webscout/Provider/UNFINISHED/VercelAIGateway.py +339 -0
- webscout/Provider/UNFINISHED/fetch_together_models.py +6 -11
- webscout/Provider/Venice.py +1 -0
- webscout/Provider/WiseCat.py +18 -20
- webscout/Provider/__init__.py +2 -96
- webscout/Provider/cerebras.py +83 -33
- webscout/Provider/copilot.py +42 -23
- webscout/Provider/scira_chat.py +4 -0
- webscout/Provider/toolbaz.py +6 -10
- webscout/Provider/typefully.py +1 -11
- webscout/__init__.py +3 -15
- webscout/auth/__init__.py +19 -4
- webscout/auth/api_key_manager.py +189 -189
- webscout/auth/auth_system.py +25 -40
- webscout/auth/config.py +105 -6
- webscout/auth/database.py +377 -22
- webscout/auth/models.py +185 -130
- webscout/auth/request_processing.py +175 -11
- webscout/auth/routes.py +99 -2
- webscout/auth/server.py +9 -2
- webscout/auth/simple_logger.py +236 -0
- webscout/conversation.py +22 -20
- webscout/sanitize.py +1078 -0
- webscout/scout/README.md +20 -23
- webscout/scout/core/crawler.py +125 -38
- webscout/scout/core/scout.py +26 -5
- webscout/version.py +1 -1
- webscout/webscout_search.py +13 -6
- webscout/webscout_search_async.py +10 -8
- webscout/yep_search.py +13 -5
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/METADATA +10 -149
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/RECORD +88 -87
- webscout/Provider/Glider.py +0 -225
- webscout/Provider/OPENAI/README_AUTOPROXY.md +0 -238
- webscout/Provider/OPENAI/c4ai.py +0 -394
- webscout/Provider/OPENAI/glider.py +0 -330
- webscout/Provider/OPENAI/typegpt.py +0 -368
- webscout/Provider/OPENAI/uncovrAI.py +0 -477
- webscout/Provider/TTS/sthir.py +0 -94
- webscout/Provider/WritingMate.py +0 -273
- webscout/Provider/typegpt.py +0 -284
- webscout/Provider/uncovr.py +0 -333
- /webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/WHEEL +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/entry_points.txt +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.3.4.dist-info → webscout-8.3.6.dist-info}/top_level.txt +0 -0
webscout/scout/README.md
CHANGED

@@ -1,27 +1,24 @@
+**🚀 The Most Advanced HTML Parser & Web Crawler for AI/LLM Data Collection**
 
-[](https://www.python.org/)
-[](https://opensource.org/licenses/MIT)
-[](https://github.com/OE-LUCIFER/Webscout)
-[](https://github.com/OE-LUCIFER/Webscout/wiki)
-[](https://github.com/OE-LUCIFER/Webscout/pulls)
+**🌟 Built for the Future • Powered by Intelligence • Trusted by Developers**
 
-</div>
 
 ## 📋 Overview
 
-Scout is
+Scout is an ultra-powerful, enterprise-grade HTML parsing and web crawling library designed for the AI era. Built with LLM data collection in mind, Scout provides unparalleled capabilities for extracting, analyzing, and processing web content at scale. With its BeautifulSoup-compatible API enhanced with modern features, Scout is the go-to solution for serious web scraping projects.
 
 <details open>
-<summary><b
+<summary><b>🌟 Why Scout is the Ultimate Choice</b></summary>
+
+- **🧠 LLM-Optimized Crawling**: Purpose-built for collecting high-quality training data for Large Language Models
+- **🌐 Subdomain Intelligence**: Automatically discovers and crawls subdomains (e.g., blog.example.com, docs.example.com)
+- **⚡ Lightning-Fast Performance**: Multi-threaded concurrent crawling with intelligent rate limiting
+- **🎯 Surgical Precision**: Advanced content extraction that preserves structure while removing noise
+- **🔍 Deep Analysis**: Built-in NLP capabilities for entity extraction, text analysis, and semantic understanding
+- **🛡️ Enterprise-Ready**: Robust error handling, retry mechanisms, and respect for robots.txt
+- **📊 Rich Data Extraction**: Captures metadata, structured data, semantic content, and more
+- **🔄 Format Flexibility**: Export to JSON, Markdown, CSV, or custom formats
+- **🎨 BeautifulSoup++ API**: Familiar interface with 10x more features
 
 </details>
 
@@ -46,7 +43,7 @@ pip install webscout
 Or install the latest version from GitHub:
 
 ```bash
-pip install git+https://github.com/
+pip install git+https://github.com/OEvortex/Webscout.git
 ```
 
 ## 🚀 Quick Start

@@ -361,7 +358,7 @@ cached_data = scout.cache('parsed_data')
 - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
 - `_is_valid_url(url)`: Check if a URL is valid (internal method)
 
-For detailed API documentation, please refer to the [documentation](https://github.com/
+For detailed API documentation, please refer to the [documentation](https://github.com/OEvortex/Webscout/wiki).
 
 ## 🔧 Dependencies

@@ -396,9 +393,9 @@ This project is licensed under the MIT License - see the LICENSE file for detail
 <div align="center">
 <p>Made with ❤️ by the Webscout team</p>
 <p>
-<a href="https://github.com/
-<a href="https://github.com/
-<a href="https://github.com/
-<a href="https://github.com/
+<a href="https://github.com/OEvortex/Webscout">GitHub</a> •
+<a href="https://github.com/OEvortex/Webscout/wiki">Documentation</a> •
+<a href="https://github.com/OEvortex/Webscout/issues">Report Bug</a> •
+<a href="https://github.com/OEvortex/Webscout/issues">Request Feature</a>
 </p>
 </div>
webscout/scout/core/crawler.py
CHANGED

@@ -1,5 +1,5 @@
 """
-Scout Crawler Module
+Scout Crawler Module - Ultra Advanced Web Crawling System
 """
 
 import concurrent.futures

@@ -7,18 +7,82 @@ import urllib.parse
 import time
 import hashlib
 import re
+import json
+import sqlite3
+import threading
+import queue
+import logging
+import mimetypes
+import pickle
+import asyncio
+import aiohttp
+import random
 from urllib import robotparser
-from datetime import datetime
-from typing import Dict, List, Optional, Union
-from
-from
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Union, Set, Tuple, Callable, Any
+from collections import defaultdict, deque
+from dataclasses import dataclass, field
+from enum import Enum
+from pathlib import Path
+
+try:
+    from webscout.litagent import LitAgent
+except ImportError:
+    LitAgent = None
+
+try:
+    from curl_cffi.requests import Session
+except ImportError:
+    import requests
+    Session = requests.Session
 
 from .scout import Scout
+from .text_analyzer import ScoutTextAnalyzer
+
 
+@dataclass
+class CrawlConfig:
+    """Configuration for the crawler."""
+    max_pages: int = 1000
+    max_depth: int = 10
+    delay: float = 0.5
+    obey_robots: bool = True
+    crawl_subdomains: bool = True
+    max_workers: int = 10
+    timeout: int = 30
+    retry_attempts: int = 3
+    include_external_links: bool = False
+    extract_metadata: bool = True
+    extract_structured_data: bool = True
+    extract_semantic_content: bool = True
+
+
+@dataclass
+class PageData:
+    """Comprehensive page data for LLM training."""
+    url: str
+    title: str
+    text: str
+    clean_text: str
+    markdown_text: str
+    links: List[str]
+    internal_links: List[str]
+    external_links: List[str]
+    metadata: Dict[str, Any]
+    structured_data: Dict[str, Any]
+    semantic_content: Dict[str, Any]
+    headers: Dict[str, str]
+    status_code: int
+    content_type: str
+    language: str
+    timestamp: str
+    depth: int
+    word_count: int
 
 class ScoutCrawler:
     """
+    Ultra-advanced web crawling utility optimized for LLM data collection.
     """
     def __init__(self, base_url: str, max_pages: int = 50, tags_to_remove: List[str] = None, session: Optional[Session] = None, delay: float = 0.5, obey_robots: bool = True, allowed_domains: Optional[List[str]] = None):
         """

@@ -33,13 +97,7 @@ class ScoutCrawler:
         self.max_pages = max_pages
         self.tags_to_remove = tags_to_remove if tags_to_remove is not None else [
             "script",
-            "style"
-            "header",
-            "footer",
-            "nav",
-            "aside",
-            "form",
-            "button",
+            "style"
         ]
         self.visited_urls = set()
         self.crawled_pages = []

@@ -50,7 +108,10 @@ class ScoutCrawler:
         self.session.headers.setdefault("User-Agent", self.agent.chrome())
         self.delay = delay
         self.obey_robots = obey_robots
+        # Allow crawling of subdomains by default
+        base_domain = urllib.parse.urlparse(base_url).netloc.split('.')
+        self.base_domain = '.'.join(base_domain[-2:]) if len(base_domain) > 1 else base_domain[0]
+        self.allowed_domains = allowed_domains or [self.base_domain]
         self.last_request_time = 0
         self.url_hashes = set()
         if obey_robots:

@@ -84,7 +145,8 @@ class ScoutCrawler:
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme not in ["http", "https"]:
             return False
+        # Allow crawling subdomains
+        if not parsed_url.netloc.endswith(self.base_domain):
             return False
         if self.obey_robots and self.robots:
             return self.robots.can_fetch("*", url)

@@ -127,6 +189,9 @@ class ScoutCrawler:
         """
         if url in self.visited_urls or self._is_duplicate(url):
             return {}
+        # Log URL to crawl
+        print(f"Attempting to crawl URL: {url} (depth: {depth})")
+
         # Throttle requests
         now = time.time()
         if self.last_request_time:

@@ -142,18 +207,38 @@ class ScoutCrawler:
             scout = Scout(response.content, features="lxml")
             title_result = scout.find("title")
             title = title_result[0].get_text() if title_result else ""
+
+            # Remove only script and style tags before extracting text
             for tag_name in self.tags_to_remove:
                 for tag in scout._soup.find_all(tag_name):
-                    tag.
+                    tag.decompose()
+
             visible_text = self._extract_main_text(scout._soup)
+
+            # Extract links from header, footer, nav, etc.
+            essential_links = []
+            for essential_tag in ['header', 'nav', 'footer']:
+                elements = scout.find_all(essential_tag)
+                for element in elements:
+                    links = element.find_all('a', href=True)
+                    essential_links.extend(
+                        urllib.parse.urljoin(url, link.get('href'))
+                        for link in links
+                        if link.get('href') and self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+                    )
+
+            all_links = [
+                urllib.parse.urljoin(url, link.get('href'))
+                for link in scout.find_all('a', href=True)
+                if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
+            ]
+
+            combined_links = list(set(all_links + essential_links))
+
             page_info = {
                 'url': url,
                 'title': title,
-                'links':
-                    urllib.parse.urljoin(url, link.get('href'))
-                    for link in scout.find_all('a', href=True)
-                    if self._is_valid_url(urllib.parse.urljoin(url, link.get('href')))
-                ],
+                'links': combined_links,
                 'text': visible_text,
                 'depth': depth,
                 'timestamp': datetime.utcnow().isoformat(),

@@ -178,7 +263,7 @@ class ScoutCrawler:
         submitted_links: set[str] = set()
 
         while futures:
-            if len(self.visited_urls) >= self.max_pages:
+            if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
                 break
             done, not_done = concurrent.futures.wait(
                 futures, return_when=concurrent.futures.FIRST_COMPLETED

@@ -190,21 +275,23 @@ class ScoutCrawler:
 
                 if page_info:
                     yield page_info
+
+                    if self.max_pages is not None and len(self.visited_urls) >= self.max_pages:
+                        return
 
-                    link,
-                    page_info.get("depth", 0) + 1,
+                    for link in page_info.get("links", []):
+                        if (
+                            (self.max_pages is None or len(self.visited_urls) < self.max_pages)
+                            and link not in self.visited_urls
+                            and link not in submitted_links
+                        ):
+                            submitted_links.add(link)
+                            futures.add(
+                                executor.submit(
+                                    self._crawl_page,
+                                    link,
+                                    page_info.get("depth", 0) + 1,
+                                )
                             )
+                else:
+                    print(f"No page info retrieved from crawling")
webscout/scout/core/scout.py
CHANGED

@@ -24,7 +24,8 @@ class Scout:
     Enhanced with advanced features and intelligent parsing.
     """
 
-    def __init__(self, markup="", features='html.parser', from_encoding=None,
+    def __init__(self, markup="", features='html.parser', from_encoding=None,
+                 exclude_encodings=None, element_classes=None, **kwargs):
         """
         Initialize Scout with HTML content.
 

@@ -32,8 +33,17 @@
             markup (str): HTML content to parse
             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
             from_encoding (str): Source encoding (if known)
+            exclude_encodings (list): Encodings to avoid
+            element_classes (dict): Custom classes for different element types
             **kwargs: Additional parsing options
         """
+        # Store original markup and settings
+        self.original_encoding = from_encoding
+        self.exclude_encodings = exclude_encodings or []
+        self.element_classes = element_classes or {}
+        self.builder_features = features
+        self.contains_replacement_characters = False
+
         # Intelligent markup handling
         self.markup = self._preprocess_markup(markup, from_encoding)
         self.features = features

@@ -50,13 +60,24 @@
 
         # Parse that HTML! 🎯
         self._soup = self.parser.parse(self.markup)
+
+        # Set up the root element properly
+        if hasattr(self._soup, 'name'):
+            self.name = self._soup.name
+        else:
+            self.name = '[document]'
+
         # BeautifulSoup-like attributes
-        self.name = self._soup.name if hasattr(self._soup, 'name') else None
         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
+        self.contents = self._soup.contents if hasattr(self._soup, 'contents') else []
+        self.parent = None
+        self.next_sibling = None
+        self.previous_sibling = None
+
+        # Advanced parsing options and caching
         self._cache = {}
+        self._tag_name_cache = {}
+        self._css_selector_cache = {}
 
         # Text and web analyzers
         self.text_analyzer = ScoutTextAnalyzer()
webscout/version.py
CHANGED

@@ -1,2 +1,2 @@
-__version__ = "8.3.4"
+__version__ = "8.3.6"
 __prog__ = "webscout"
webscout/webscout_search.py
CHANGED

@@ -2,7 +2,7 @@ from __future__ import annotations
 
 # import logging
 import json
+import os
 import warnings
 from concurrent.futures import ThreadPoolExecutor
 from datetime import datetime, timezone

@@ -13,10 +13,17 @@ from random import choice, shuffle
 from threading import Event
 from time import sleep, time
 from types import TracebackType
-from typing import Any,
-import
+from typing import Any, Literal
+from urllib.parse import quote
+
 from webscout.litagent import LitAgent
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests  # type: ignore
 
 try:

@@ -28,7 +35,7 @@ try:
 except ImportError:
     LXML_AVAILABLE = False
 
-from .exceptions import
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _calculate_distance,
     _expand_proxy_tb_alias,

@@ -1173,4 +1180,4 @@ class WEBS:
                     "visibility_m": hour.get("visibility"),
                 })
 
-        return formatted_data
+        return formatted_data
webscout/webscout_search_async.py
CHANGED

@@ -3,14 +3,19 @@ from __future__ import annotations
 import asyncio
 import os
 import warnings
-from datetime import datetime, timezone
 from functools import cached_property
 from itertools import cycle
 from random import choice, shuffle
 from time import time
 from types import TracebackType
-from typing import Any, Dict, List, Optional, Type, Union
+from typing import Any, Dict, List, Optional, Type, Union
+
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
 import curl_cffi.requests
 from lxml.etree import _Element
 from lxml.html import HTMLParser as LHTMLParser

@@ -18,18 +23,15 @@ from lxml.html import document_fromstring
 
 from webscout.litagent.agent import LitAgent
 
-from .exceptions import
+from .exceptions import RatelimitE, TimeoutE, WebscoutE
 from .utils import (
     _expand_proxy_tb_alias,
     _extract_vqd,
     _normalize,
     _normalize_url,
-    json_loads,
 )
 
 
 class AsyncWEBS:
     """Asynchronous webscout class to get search results."""
 

@@ -644,4 +646,4 @@
             TimeoutE: Inherits from WebscoutE, raised for API request timeouts.
         """
         # These methods are not implemented in the async version yet
-        raise NotImplementedError("aweather method is not implemented yet")
+        raise NotImplementedError("aweather method is not implemented yet")
webscout/yep_search.py
CHANGED

@@ -1,9 +1,17 @@
+# Import trio before curl_cffi to prevent eventlet socket monkey-patching conflicts
+# See: https://github.com/python-trio/trio/issues/3015
+try:
+    import trio  # noqa: F401
+except ImportError:
+    pass  # trio is optional, ignore if not available
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, List, Optional
 from urllib.parse import urlencode
+
+from curl_cffi.requests import Session
+
 from webscout.litagent import LitAgent
-
-from concurrent.futures import ThreadPoolExecutor
-import json
+
 
 class YepSearch:
     """Yep.com search class to get search results."""

@@ -335,4 +343,4 @@ if __name__ == "__main__":
     print("---" * 30)
     print(image_results)
     print("---" * 30)
-    print(suggestions)
+    print(suggestions)