webscout 8.2.8__py3-none-any.whl → 8.3__py3-none-any.whl
This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/AIauto.py +34 -16
- webscout/AIbase.py +96 -37
- webscout/AIutel.py +491 -87
- webscout/Bard.py +441 -323
- webscout/Extra/GitToolkit/__init__.py +10 -10
- webscout/Extra/YTToolkit/ytapi/video.py +232 -232
- webscout/Litlogger/README.md +10 -0
- webscout/Litlogger/__init__.py +7 -59
- webscout/Litlogger/formats.py +4 -0
- webscout/Litlogger/handlers.py +103 -0
- webscout/Litlogger/levels.py +13 -0
- webscout/Litlogger/logger.py +92 -0
- webscout/Provider/AISEARCH/Perplexity.py +332 -358
- webscout/Provider/AISEARCH/felo_search.py +9 -35
- webscout/Provider/AISEARCH/genspark_search.py +30 -56
- webscout/Provider/AISEARCH/hika_search.py +4 -16
- webscout/Provider/AISEARCH/iask_search.py +410 -436
- webscout/Provider/AISEARCH/monica_search.py +4 -30
- webscout/Provider/AISEARCH/scira_search.py +6 -32
- webscout/Provider/AISEARCH/webpilotai_search.py +38 -64
- webscout/Provider/Blackboxai.py +155 -35
- webscout/Provider/ChatSandbox.py +2 -1
- webscout/Provider/Deepinfra.py +339 -339
- webscout/Provider/ExaChat.py +358 -358
- webscout/Provider/Gemini.py +169 -169
- webscout/Provider/GithubChat.py +1 -2
- webscout/Provider/Glider.py +3 -3
- webscout/Provider/HeckAI.py +172 -82
- webscout/Provider/LambdaChat.py +1 -0
- webscout/Provider/MCPCore.py +7 -3
- webscout/Provider/OPENAI/BLACKBOXAI.py +421 -139
- webscout/Provider/OPENAI/Cloudflare.py +38 -21
- webscout/Provider/OPENAI/FalconH1.py +457 -0
- webscout/Provider/OPENAI/FreeGemini.py +35 -18
- webscout/Provider/OPENAI/NEMOTRON.py +34 -34
- webscout/Provider/OPENAI/PI.py +427 -0
- webscout/Provider/OPENAI/Qwen3.py +304 -0
- webscout/Provider/OPENAI/README.md +952 -1253
- webscout/Provider/OPENAI/TwoAI.py +374 -0
- webscout/Provider/OPENAI/__init__.py +7 -1
- webscout/Provider/OPENAI/ai4chat.py +73 -63
- webscout/Provider/OPENAI/api.py +869 -644
- webscout/Provider/OPENAI/base.py +2 -0
- webscout/Provider/OPENAI/c4ai.py +34 -13
- webscout/Provider/OPENAI/chatgpt.py +575 -556
- webscout/Provider/OPENAI/chatgptclone.py +512 -487
- webscout/Provider/OPENAI/chatsandbox.py +11 -6
- webscout/Provider/OPENAI/copilot.py +258 -0
- webscout/Provider/OPENAI/deepinfra.py +327 -318
- webscout/Provider/OPENAI/e2b.py +140 -104
- webscout/Provider/OPENAI/exaai.py +420 -411
- webscout/Provider/OPENAI/exachat.py +448 -443
- webscout/Provider/OPENAI/flowith.py +7 -3
- webscout/Provider/OPENAI/freeaichat.py +12 -8
- webscout/Provider/OPENAI/glider.py +15 -8
- webscout/Provider/OPENAI/groq.py +5 -2
- webscout/Provider/OPENAI/heckai.py +311 -307
- webscout/Provider/OPENAI/llmchatco.py +9 -7
- webscout/Provider/OPENAI/mcpcore.py +18 -9
- webscout/Provider/OPENAI/multichat.py +7 -5
- webscout/Provider/OPENAI/netwrck.py +16 -11
- webscout/Provider/OPENAI/oivscode.py +290 -0
- webscout/Provider/OPENAI/opkfc.py +507 -496
- webscout/Provider/OPENAI/pydantic_imports.py +172 -0
- webscout/Provider/OPENAI/scirachat.py +29 -17
- webscout/Provider/OPENAI/sonus.py +308 -303
- webscout/Provider/OPENAI/standardinput.py +442 -433
- webscout/Provider/OPENAI/textpollinations.py +18 -11
- webscout/Provider/OPENAI/toolbaz.py +419 -413
- webscout/Provider/OPENAI/typefully.py +17 -10
- webscout/Provider/OPENAI/typegpt.py +21 -11
- webscout/Provider/OPENAI/uncovrAI.py +477 -462
- webscout/Provider/OPENAI/utils.py +90 -79
- webscout/Provider/OPENAI/venice.py +435 -425
- webscout/Provider/OPENAI/wisecat.py +387 -381
- webscout/Provider/OPENAI/writecream.py +166 -163
- webscout/Provider/OPENAI/x0gpt.py +26 -37
- webscout/Provider/OPENAI/yep.py +384 -356
- webscout/Provider/PI.py +2 -1
- webscout/Provider/TTI/README.md +55 -101
- webscout/Provider/TTI/__init__.py +4 -9
- webscout/Provider/TTI/aiarta.py +365 -0
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/base.py +64 -0
- webscout/Provider/TTI/fastflux.py +200 -0
- webscout/Provider/TTI/magicstudio.py +201 -0
- webscout/Provider/TTI/piclumen.py +203 -0
- webscout/Provider/TTI/pixelmuse.py +225 -0
- webscout/Provider/TTI/pollinations.py +221 -0
- webscout/Provider/TTI/utils.py +11 -0
- webscout/Provider/TTS/__init__.py +2 -1
- webscout/Provider/TTS/base.py +159 -159
- webscout/Provider/TTS/openai_fm.py +129 -0
- webscout/Provider/TextPollinationsAI.py +308 -308
- webscout/Provider/TwoAI.py +239 -44
- webscout/Provider/UNFINISHED/Youchat.py +330 -330
- webscout/Provider/UNFINISHED/puterjs.py +635 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -119
- webscout/Provider/Writecream.py +246 -246
- webscout/Provider/__init__.py +2 -2
- webscout/Provider/ai4chat.py +33 -8
- webscout/Provider/granite.py +41 -6
- webscout/Provider/koala.py +169 -169
- webscout/Provider/oivscode.py +309 -0
- webscout/Provider/samurai.py +3 -2
- webscout/Provider/scnet.py +1 -0
- webscout/Provider/typegpt.py +3 -3
- webscout/Provider/uncovr.py +368 -368
- webscout/client.py +70 -0
- webscout/litprinter/__init__.py +58 -58
- webscout/optimizers.py +419 -419
- webscout/scout/README.md +3 -1
- webscout/scout/core/crawler.py +134 -64
- webscout/scout/core/scout.py +148 -109
- webscout/scout/element.py +106 -88
- webscout/swiftcli/Readme.md +323 -323
- webscout/swiftcli/plugins/manager.py +9 -2
- webscout/version.py +1 -1
- webscout/zeroart/__init__.py +134 -134
- webscout/zeroart/effects.py +100 -100
- webscout/zeroart/fonts.py +1238 -1238
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/METADATA +160 -35
- webscout-8.3.dist-info/RECORD +290 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/WHEEL +1 -1
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/entry_points.txt +1 -0
- webscout/Litlogger/Readme.md +0 -175
- webscout/Litlogger/core/__init__.py +0 -6
- webscout/Litlogger/core/level.py +0 -23
- webscout/Litlogger/core/logger.py +0 -165
- webscout/Litlogger/handlers/__init__.py +0 -12
- webscout/Litlogger/handlers/console.py +0 -33
- webscout/Litlogger/handlers/file.py +0 -143
- webscout/Litlogger/handlers/network.py +0 -173
- webscout/Litlogger/styles/__init__.py +0 -7
- webscout/Litlogger/styles/colors.py +0 -249
- webscout/Litlogger/styles/formats.py +0 -458
- webscout/Litlogger/styles/text.py +0 -87
- webscout/Litlogger/utils/__init__.py +0 -6
- webscout/Litlogger/utils/detectors.py +0 -153
- webscout/Litlogger/utils/formatters.py +0 -200
- webscout/Provider/ChatGPTGratis.py +0 -194
- webscout/Provider/TTI/AiForce/README.md +0 -159
- webscout/Provider/TTI/AiForce/__init__.py +0 -22
- webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
- webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
- webscout/Provider/TTI/FreeAIPlayground/README.md +0 -99
- webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
- webscout/Provider/TTI/ImgSys/README.md +0 -174
- webscout/Provider/TTI/ImgSys/__init__.py +0 -23
- webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
- webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
- webscout/Provider/TTI/MagicStudio/README.md +0 -101
- webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
- webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
- webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
- webscout/Provider/TTI/Nexra/README.md +0 -155
- webscout/Provider/TTI/Nexra/__init__.py +0 -22
- webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
- webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
- webscout/Provider/TTI/PollinationsAI/README.md +0 -146
- webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
- webscout/Provider/TTI/aiarta/README.md +0 -134
- webscout/Provider/TTI/aiarta/__init__.py +0 -2
- webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
- webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
- webscout/Provider/TTI/artbit/README.md +0 -100
- webscout/Provider/TTI/artbit/__init__.py +0 -22
- webscout/Provider/TTI/artbit/async_artbit.py +0 -155
- webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
- webscout/Provider/TTI/fastflux/README.md +0 -129
- webscout/Provider/TTI/fastflux/__init__.py +0 -22
- webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
- webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
- webscout/Provider/TTI/huggingface/README.md +0 -114
- webscout/Provider/TTI/huggingface/__init__.py +0 -22
- webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
- webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
- webscout/Provider/TTI/piclumen/README.md +0 -161
- webscout/Provider/TTI/piclumen/__init__.py +0 -23
- webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
- webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
- webscout/Provider/TTI/pixelmuse/README.md +0 -79
- webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
- webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
- webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
- webscout/Provider/TTI/talkai/README.md +0 -139
- webscout/Provider/TTI/talkai/__init__.py +0 -4
- webscout/Provider/TTI/talkai/async_talkai.py +0 -229
- webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
- webscout/Provider/UNFINISHED/oivscode.py +0 -351
- webscout-8.2.8.dist-info/RECORD +0 -334
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.8.dist-info → webscout-8.3.dist-info}/top_level.txt +0 -0
webscout/scout/core/scout.py
CHANGED
|
@@ -1,20 +1,20 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Scout Main Module - HTML Parsing and Traversal
|
|
3
3
|
"""
|
|
4
|
-
import re
|
|
5
|
-
import json
|
|
6
4
|
import hashlib
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
7
|
import unicodedata
|
|
8
8
|
import urllib.parse
|
|
9
|
-
from typing import
|
|
9
|
+
from typing import Any, Dict, List, Optional
|
|
10
10
|
|
|
11
|
+
from ..element import NavigableString, Tag
|
|
11
12
|
from ..parsers import ParserRegistry
|
|
12
|
-
from ..element import Tag, NavigableString
|
|
13
13
|
from ..utils import decode_markup
|
|
14
|
-
from .text_analyzer import ScoutTextAnalyzer
|
|
15
|
-
from .web_analyzer import ScoutWebAnalyzer
|
|
16
14
|
from .search_result import ScoutSearchResult
|
|
15
|
+
from .text_analyzer import ScoutTextAnalyzer
|
|
17
16
|
from .text_utils import SentenceTokenizer
|
|
17
|
+
from .web_analyzer import ScoutWebAnalyzer
|
|
18
18
|
|
|
19
19
|
|
|
20
20
|
class Scout:
|
|
@@ -23,11 +23,11 @@ class Scout:
|
|
|
23
23
|
A comprehensive HTML parsing and traversal library.
|
|
24
24
|
Enhanced with advanced features and intelligent parsing.
|
|
25
25
|
"""
|
|
26
|
-
|
|
26
|
+
|
|
27
27
|
def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
|
|
28
28
|
"""
|
|
29
29
|
Initialize Scout with HTML content.
|
|
30
|
-
|
|
30
|
+
|
|
31
31
|
Args:
|
|
32
32
|
markup (str): HTML content to parse
|
|
33
33
|
features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
|
|
@@ -38,50 +38,50 @@ class Scout:
|
|
|
38
38
|
self.markup = self._preprocess_markup(markup, from_encoding)
|
|
39
39
|
self.features = features
|
|
40
40
|
self.from_encoding = from_encoding
|
|
41
|
-
|
|
41
|
+
|
|
42
42
|
# Get the right parser for the job
|
|
43
43
|
if features not in ParserRegistry.list_parsers():
|
|
44
44
|
raise ValueError(
|
|
45
45
|
f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
|
|
46
46
|
)
|
|
47
|
-
|
|
47
|
+
|
|
48
48
|
parser_class = ParserRegistry.get_parser(features)
|
|
49
49
|
self.parser = parser_class
|
|
50
|
-
|
|
50
|
+
|
|
51
51
|
# Parse that HTML! 🎯
|
|
52
52
|
self._soup = self.parser.parse(self.markup)
|
|
53
|
-
|
|
53
|
+
|
|
54
54
|
# BeautifulSoup-like attributes
|
|
55
55
|
self.name = self._soup.name if hasattr(self._soup, 'name') else None
|
|
56
56
|
self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
|
|
57
|
-
|
|
57
|
+
|
|
58
58
|
# Advanced parsing options
|
|
59
59
|
self._cache = {}
|
|
60
|
-
|
|
60
|
+
|
|
61
61
|
# Text and web analyzers
|
|
62
62
|
self.text_analyzer = ScoutTextAnalyzer()
|
|
63
63
|
self.web_analyzer = ScoutWebAnalyzer()
|
|
64
|
-
|
|
64
|
+
|
|
65
65
|
def normalize_text(self, text: str, form='NFKD') -> str:
|
|
66
66
|
"""
|
|
67
67
|
Normalize text using Unicode normalization.
|
|
68
|
-
|
|
68
|
+
|
|
69
69
|
Args:
|
|
70
70
|
text (str): Input text
|
|
71
71
|
form (str, optional): Normalization form
|
|
72
|
-
|
|
72
|
+
|
|
73
73
|
Returns:
|
|
74
74
|
str: Normalized text
|
|
75
75
|
"""
|
|
76
76
|
return unicodedata.normalize(form, text)
|
|
77
|
-
|
|
77
|
+
|
|
78
78
|
def url_parse(self, url: str) -> Dict[str, str]:
|
|
79
79
|
"""
|
|
80
80
|
Parse and analyze a URL.
|
|
81
|
-
|
|
81
|
+
|
|
82
82
|
Args:
|
|
83
83
|
url (str): URL to parse
|
|
84
|
-
|
|
84
|
+
|
|
85
85
|
Returns:
|
|
86
86
|
Dict[str, str]: Parsed URL components
|
|
87
87
|
"""
|
|
@@ -94,39 +94,39 @@ class Scout:
|
|
|
94
94
|
'query': parsed.query,
|
|
95
95
|
'fragment': parsed.fragment
|
|
96
96
|
}
|
|
97
|
-
|
|
97
|
+
|
|
98
98
|
def analyze_page_structure(self) -> Dict[str, Any]:
|
|
99
99
|
"""
|
|
100
100
|
Analyze the structure of the parsed page.
|
|
101
|
-
|
|
101
|
+
|
|
102
102
|
Returns:
|
|
103
103
|
Dict[str, Any]: Page structure analysis
|
|
104
104
|
"""
|
|
105
105
|
return self.web_analyzer.analyze_page_structure(self)
|
|
106
|
-
|
|
106
|
+
|
|
107
107
|
def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
|
|
108
108
|
"""
|
|
109
109
|
Perform advanced text analysis.
|
|
110
|
-
|
|
110
|
+
|
|
111
111
|
Args:
|
|
112
112
|
text (str, optional): Text to analyze. If None, uses page text.
|
|
113
|
-
|
|
113
|
+
|
|
114
114
|
Returns:
|
|
115
115
|
Dict[str, Any]: Text analysis results
|
|
116
116
|
"""
|
|
117
117
|
if text is None:
|
|
118
118
|
text = self.get_text()
|
|
119
|
-
|
|
119
|
+
|
|
120
120
|
return {
|
|
121
121
|
'word_count': self.text_analyzer.count_words(text),
|
|
122
122
|
'entities': self.text_analyzer.extract_entities(text),
|
|
123
123
|
'tokens': self.text_analyzer.tokenize(text)
|
|
124
124
|
}
|
|
125
|
-
|
|
125
|
+
|
|
126
126
|
def extract_semantic_info(self) -> Dict[str, Any]:
|
|
127
127
|
"""
|
|
128
128
|
Extract semantic information from the document.
|
|
129
|
-
|
|
129
|
+
|
|
130
130
|
Returns:
|
|
131
131
|
Dict[str, Any]: Semantic information
|
|
132
132
|
"""
|
|
@@ -146,29 +146,29 @@ class Scout:
|
|
|
146
146
|
}
|
|
147
147
|
}
|
|
148
148
|
return semantic_info
|
|
149
|
-
|
|
149
|
+
|
|
150
150
|
def cache(self, key: str, value: Any = None) -> Any:
|
|
151
151
|
"""
|
|
152
152
|
Manage a cache for parsed content.
|
|
153
|
-
|
|
153
|
+
|
|
154
154
|
Args:
|
|
155
155
|
key (str): Cache key
|
|
156
156
|
value (Any, optional): Value to cache
|
|
157
|
-
|
|
157
|
+
|
|
158
158
|
Returns:
|
|
159
159
|
Any: Cached value or None
|
|
160
160
|
"""
|
|
161
161
|
if value is not None:
|
|
162
162
|
self._cache[key] = value
|
|
163
163
|
return self._cache.get(key)
|
|
164
|
-
|
|
164
|
+
|
|
165
165
|
def hash_content(self, method='md5') -> str:
|
|
166
166
|
"""
|
|
167
167
|
Generate a hash of the parsed content.
|
|
168
|
-
|
|
168
|
+
|
|
169
169
|
Args:
|
|
170
170
|
method (str, optional): Hashing method
|
|
171
|
-
|
|
171
|
+
|
|
172
172
|
Returns:
|
|
173
173
|
str: Content hash
|
|
174
174
|
"""
|
|
@@ -177,21 +177,21 @@ class Scout:
|
|
|
177
177
|
'sha1': hashlib.sha1,
|
|
178
178
|
'sha256': hashlib.sha256
|
|
179
179
|
}
|
|
180
|
-
|
|
180
|
+
|
|
181
181
|
if method not in hash_methods:
|
|
182
182
|
raise ValueError(f"Unsupported hash method: {method}")
|
|
183
|
-
|
|
183
|
+
|
|
184
184
|
hasher = hash_methods[method]()
|
|
185
185
|
hasher.update(str(self._soup).encode('utf-8'))
|
|
186
186
|
return hasher.hexdigest()
|
|
187
|
-
|
|
187
|
+
|
|
188
188
|
def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
|
|
189
189
|
"""
|
|
190
190
|
Extract all links from the document.
|
|
191
|
-
|
|
191
|
+
|
|
192
192
|
Args:
|
|
193
193
|
base_url (str, optional): Base URL for resolving relative links
|
|
194
|
-
|
|
194
|
+
|
|
195
195
|
Returns:
|
|
196
196
|
List[Dict[str, str]]: List of link dictionaries
|
|
197
197
|
"""
|
|
@@ -202,7 +202,7 @@ class Scout:
|
|
|
202
202
|
# Resolve relative URLs if base_url is provided
|
|
203
203
|
if base_url and not href.startswith(('http://', 'https://', '//')):
|
|
204
204
|
href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
|
|
205
|
-
|
|
205
|
+
|
|
206
206
|
links.append({
|
|
207
207
|
'href': href,
|
|
208
208
|
'text': link.get_text(strip=True),
|
|
@@ -210,11 +210,11 @@ class Scout:
|
|
|
210
210
|
'type': link.get('type')
|
|
211
211
|
})
|
|
212
212
|
return links
|
|
213
|
-
|
|
213
|
+
|
|
214
214
|
def extract_metadata(self) -> Dict[str, Any]:
|
|
215
215
|
"""
|
|
216
216
|
Extract metadata from HTML document.
|
|
217
|
-
|
|
217
|
+
|
|
218
218
|
Returns:
|
|
219
219
|
Dict[str, Any]: Extracted metadata
|
|
220
220
|
"""
|
|
@@ -225,87 +225,87 @@ class Scout:
|
|
|
225
225
|
'og_metadata': {},
|
|
226
226
|
'twitter_metadata': {}
|
|
227
227
|
}
|
|
228
|
-
|
|
228
|
+
|
|
229
229
|
# Open Graph metadata
|
|
230
230
|
for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
|
|
231
231
|
key = meta.attrs('property')[0][3:]
|
|
232
232
|
metadata['og_metadata'][key] = meta.attrs('content')[0]
|
|
233
|
-
|
|
233
|
+
|
|
234
234
|
# Twitter Card metadata
|
|
235
235
|
for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
|
|
236
236
|
key = meta.attrs('name')[0][8:]
|
|
237
237
|
metadata['twitter_metadata'][key] = meta.attrs('content')[0]
|
|
238
|
-
|
|
238
|
+
|
|
239
239
|
return metadata
|
|
240
|
-
|
|
240
|
+
|
|
241
241
|
def to_json(self, indent=2) -> str:
|
|
242
242
|
"""
|
|
243
243
|
Convert parsed content to JSON.
|
|
244
|
-
|
|
244
|
+
|
|
245
245
|
Args:
|
|
246
246
|
indent (int, optional): JSON indentation
|
|
247
|
-
|
|
247
|
+
|
|
248
248
|
Returns:
|
|
249
249
|
str: JSON representation of the document
|
|
250
250
|
"""
|
|
251
251
|
def _tag_to_dict(tag):
|
|
252
252
|
if isinstance(tag, NavigableString):
|
|
253
253
|
return str(tag)
|
|
254
|
-
|
|
254
|
+
|
|
255
255
|
result = {
|
|
256
256
|
'name': tag.name,
|
|
257
257
|
'attrs': tag.attrs,
|
|
258
258
|
'text': tag.get_text(strip=True)
|
|
259
259
|
}
|
|
260
|
-
|
|
260
|
+
|
|
261
261
|
if tag.contents:
|
|
262
262
|
result['children'] = [_tag_to_dict(child) for child in tag.contents]
|
|
263
|
-
|
|
263
|
+
|
|
264
264
|
return result
|
|
265
|
-
|
|
265
|
+
|
|
266
266
|
return json.dumps(_tag_to_dict(self._soup), indent=indent)
|
|
267
|
-
|
|
267
|
+
|
|
268
268
|
def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
|
|
269
269
|
"""
|
|
270
270
|
Find the first matching element.
|
|
271
|
-
|
|
271
|
+
|
|
272
272
|
Args:
|
|
273
273
|
name (str, optional): Tag name to search for
|
|
274
274
|
attrs (dict, optional): Attributes to match
|
|
275
275
|
recursive (bool, optional): Search recursively
|
|
276
276
|
text (str, optional): Text content to match
|
|
277
|
-
|
|
277
|
+
|
|
278
278
|
Returns:
|
|
279
279
|
ScoutSearchResult: First matching element
|
|
280
280
|
"""
|
|
281
281
|
result = self._soup.find(name, attrs, recursive, text, **kwargs)
|
|
282
282
|
return ScoutSearchResult([result]) if result else ScoutSearchResult([])
|
|
283
|
-
|
|
283
|
+
|
|
284
284
|
def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
|
|
285
285
|
"""
|
|
286
286
|
Find all matching elements.
|
|
287
|
-
|
|
287
|
+
|
|
288
288
|
Args:
|
|
289
289
|
name (str, optional): Tag name to search for
|
|
290
290
|
attrs (dict, optional): Attributes to match
|
|
291
291
|
recursive (bool, optional): Search recursively
|
|
292
292
|
text (str, optional): Text content to match
|
|
293
293
|
limit (int, optional): Maximum number of results
|
|
294
|
-
|
|
294
|
+
|
|
295
295
|
Returns:
|
|
296
296
|
ScoutSearchResult: List of matching elements
|
|
297
297
|
"""
|
|
298
298
|
results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
|
|
299
299
|
return ScoutSearchResult(results)
|
|
300
|
-
|
|
300
|
+
|
|
301
301
|
def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
|
|
302
302
|
"""
|
|
303
303
|
Find the first parent matching given criteria.
|
|
304
|
-
|
|
304
|
+
|
|
305
305
|
Args:
|
|
306
306
|
name (str, optional): Tag name to search for
|
|
307
307
|
attrs (dict, optional): Attributes to match
|
|
308
|
-
|
|
308
|
+
|
|
309
309
|
Returns:
|
|
310
310
|
Tag or None: First matching parent
|
|
311
311
|
"""
|
|
@@ -316,16 +316,16 @@ class Scout:
|
|
|
316
316
|
return current
|
|
317
317
|
current = current.parent
|
|
318
318
|
return None
|
|
319
|
-
|
|
319
|
+
|
|
320
320
|
def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
|
|
321
321
|
"""
|
|
322
322
|
Find all parents matching given criteria.
|
|
323
|
-
|
|
323
|
+
|
|
324
324
|
Args:
|
|
325
325
|
name (str, optional): Tag name to search for
|
|
326
326
|
attrs (dict, optional): Attributes to match
|
|
327
327
|
limit (int, optional): Maximum number of results
|
|
328
|
-
|
|
328
|
+
|
|
329
329
|
Returns:
|
|
330
330
|
List[Tag]: List of matching parents
|
|
331
331
|
"""
|
|
@@ -337,21 +337,21 @@ class Scout:
|
|
|
337
337
|
parents.append(current)
|
|
338
338
|
current = current.parent
|
|
339
339
|
return parents
|
|
340
|
-
|
|
340
|
+
|
|
341
341
|
def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
|
|
342
342
|
"""
|
|
343
343
|
Find the next sibling matching given criteria.
|
|
344
|
-
|
|
344
|
+
|
|
345
345
|
Args:
|
|
346
346
|
name (str, optional): Tag name to search for
|
|
347
347
|
attrs (dict, optional): Attributes to match
|
|
348
|
-
|
|
348
|
+
|
|
349
349
|
Returns:
|
|
350
350
|
Tag or None: First matching next sibling
|
|
351
351
|
"""
|
|
352
352
|
if not self._soup.parent:
|
|
353
353
|
return None
|
|
354
|
-
|
|
354
|
+
|
|
355
355
|
siblings = self._soup.parent.contents
|
|
356
356
|
try:
|
|
357
357
|
current_index = siblings.index(self._soup)
|
|
@@ -363,22 +363,22 @@ class Scout:
|
|
|
363
363
|
except ValueError:
|
|
364
364
|
pass
|
|
365
365
|
return None
|
|
366
|
-
|
|
366
|
+
|
|
367
367
|
def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
|
|
368
368
|
"""
|
|
369
369
|
Find all next siblings matching given criteria.
|
|
370
|
-
|
|
370
|
+
|
|
371
371
|
Args:
|
|
372
372
|
name (str, optional): Tag name to search for
|
|
373
373
|
attrs (dict, optional): Attributes to match
|
|
374
374
|
limit (int, optional): Maximum number of results
|
|
375
|
-
|
|
375
|
+
|
|
376
376
|
Returns:
|
|
377
377
|
List[Tag]: List of matching next siblings
|
|
378
378
|
"""
|
|
379
379
|
if not self._soup.parent:
|
|
380
380
|
return []
|
|
381
|
-
|
|
381
|
+
|
|
382
382
|
siblings = []
|
|
383
383
|
siblings_list = self._soup.parent.contents
|
|
384
384
|
try:
|
|
@@ -393,40 +393,79 @@ class Scout:
|
|
|
393
393
|
except ValueError:
|
|
394
394
|
pass
|
|
395
395
|
return siblings
|
|
396
|
-
|
|
396
|
+
|
|
397
|
+
def find_previous_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
|
|
398
|
+
"""Find the previous sibling matching given criteria."""
|
|
399
|
+
if not self._soup.parent:
|
|
400
|
+
return None
|
|
401
|
+
|
|
402
|
+
siblings = self._soup.parent.contents
|
|
403
|
+
try:
|
|
404
|
+
current_index = siblings.index(self._soup)
|
|
405
|
+
for sibling in reversed(siblings[:current_index]):
|
|
406
|
+
if isinstance(sibling, Tag):
|
|
407
|
+
if (name is None or sibling.name == name) and all(
|
|
408
|
+
sibling.get(k) == v for k, v in attrs.items()
|
|
409
|
+
):
|
|
410
|
+
return sibling
|
|
411
|
+
except ValueError:
|
|
412
|
+
pass
|
|
413
|
+
return None
|
|
414
|
+
|
|
415
|
+
def find_previous_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
|
|
416
|
+
"""Find all previous siblings matching given criteria."""
|
|
417
|
+
if not self._soup.parent:
|
|
418
|
+
return []
|
|
419
|
+
|
|
420
|
+
siblings = []
|
|
421
|
+
siblings_list = self._soup.parent.contents
|
|
422
|
+
try:
|
|
423
|
+
current_index = siblings_list.index(self._soup)
|
|
424
|
+
for sibling in reversed(siblings_list[:current_index]):
|
|
425
|
+
if isinstance(sibling, Tag):
|
|
426
|
+
if (name is None or sibling.name == name) and all(
|
|
427
|
+
sibling.get(k) == v for k, v in attrs.items()
|
|
428
|
+
):
|
|
429
|
+
siblings.append(sibling)
|
|
430
|
+
if limit and len(siblings) == limit:
|
|
431
|
+
break
|
|
432
|
+
except ValueError:
|
|
433
|
+
pass
|
|
434
|
+
return siblings
|
|
435
|
+
|
|
397
436
|
def select(self, selector: str) -> List[Tag]:
|
|
398
437
|
"""
|
|
399
438
|
Select elements using CSS selector.
|
|
400
|
-
|
|
439
|
+
|
|
401
440
|
Args:
|
|
402
441
|
selector (str): CSS selector string
|
|
403
|
-
|
|
442
|
+
|
|
404
443
|
Returns:
|
|
405
444
|
List[Tag]: List of matching elements
|
|
406
445
|
"""
|
|
407
446
|
return self._soup.select(selector)
|
|
408
|
-
|
|
447
|
+
|
|
409
448
|
def select_one(self, selector: str) -> Optional[Tag]:
|
|
410
449
|
"""
|
|
411
450
|
Select the first element matching the CSS selector.
|
|
412
|
-
|
|
451
|
+
|
|
413
452
|
Args:
|
|
414
453
|
selector (str): CSS selector string
|
|
415
|
-
|
|
454
|
+
|
|
416
455
|
Returns:
|
|
417
456
|
Tag or None: First matching element
|
|
418
457
|
"""
|
|
419
458
|
return self._soup.select_one(selector)
|
|
420
|
-
|
|
459
|
+
|
|
421
460
|
def get_text(self, separator=' ', strip=False, types=None) -> str:
|
|
422
461
|
"""
|
|
423
462
|
Extract all text from the parsed document.
|
|
424
|
-
|
|
463
|
+
|
|
425
464
|
Args:
|
|
426
465
|
separator (str, optional): Text separator
|
|
427
466
|
strip (bool, optional): Strip whitespace
|
|
428
467
|
types (list, optional): Types of content to extract
|
|
429
|
-
|
|
468
|
+
|
|
430
469
|
Returns:
|
|
431
470
|
str: Extracted text
|
|
432
471
|
"""
|
|
@@ -434,113 +473,113 @@ class Scout:
|
|
|
434
473
|
text = self._soup.get_text(separator, strip, types)
|
|
435
474
|
sentences = tokenizer.tokenize(text)
|
|
436
475
|
return "\n\n".join(sentences)
|
|
437
|
-
|
|
476
|
+
|
|
438
477
|
def remove_tags(self, tags: List[str]) -> None:
|
|
439
478
|
"""
|
|
440
479
|
Remove specified tags and their contents from the document.
|
|
441
|
-
|
|
480
|
+
|
|
442
481
|
Args:
|
|
443
482
|
tags (List[str]): List of tag names to remove
|
|
444
483
|
"""
|
|
445
484
|
for tag_name in tags:
|
|
446
485
|
for tag in self._soup.find_all(tag_name):
|
|
447
486
|
tag.decompose()
|
|
448
|
-
|
|
487
|
+
|
|
449
488
|
def prettify(self, formatter='minimal') -> str:
|
|
450
489
|
"""
|
|
451
490
|
Return a formatted, pretty-printed version of the HTML.
|
|
452
|
-
|
|
491
|
+
|
|
453
492
|
Args:
|
|
454
493
|
formatter (str, optional): Formatting style
|
|
455
|
-
|
|
494
|
+
|
|
456
495
|
Returns:
|
|
457
496
|
str: Prettified HTML
|
|
458
497
|
"""
|
|
459
498
|
return self._soup.prettify(formatter)
|
|
460
|
-
|
|
499
|
+
|
|
461
500
|
def decompose(self, tag: Tag = None) -> None:
|
|
462
501
|
"""
|
|
463
502
|
Remove a tag and its contents from the document.
|
|
464
|
-
|
|
503
|
+
|
|
465
504
|
Args:
|
|
466
505
|
tag (Tag, optional): Tag to remove. If None, removes the root tag.
|
|
467
506
|
"""
|
|
468
507
|
if tag is None:
|
|
469
508
|
tag = self._soup
|
|
470
509
|
tag.decompose()
|
|
471
|
-
|
|
510
|
+
|
|
472
511
|
def extract(self, tag: Tag = None) -> Tag:
|
|
473
512
|
"""
|
|
474
513
|
Remove a tag from the document and return it.
|
|
475
|
-
|
|
514
|
+
|
|
476
515
|
Args:
|
|
477
516
|
tag (Tag, optional): Tag to extract. If None, extracts the root tag.
|
|
478
|
-
|
|
517
|
+
|
|
479
518
|
Returns:
|
|
480
519
|
Tag: Extracted tag
|
|
481
520
|
"""
|
|
482
521
|
if tag is None:
|
|
483
522
|
tag = self._soup
|
|
484
523
|
return tag.extract()
|
|
485
|
-
|
|
524
|
+
|
|
486
525
|
def clear(self, tag: Tag = None) -> None:
|
|
487
526
|
"""
|
|
488
527
|
Remove a tag's contents while keeping the tag itself.
|
|
489
|
-
|
|
528
|
+
|
|
490
529
|
Args:
|
|
491
530
|
tag (Tag, optional): Tag to clear. If None, clears the root tag.
|
|
492
531
|
"""
|
|
493
532
|
if tag is None:
|
|
494
533
|
tag = self._soup
|
|
495
534
|
tag.clear()
|
|
496
|
-
|
|
535
|
+
|
|
497
536
|
def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
|
|
498
537
|
"""
|
|
499
538
|
Replace one tag with another.
|
|
500
|
-
|
|
539
|
+
|
|
501
540
|
Args:
|
|
502
541
|
old_tag (Tag): Tag to replace
|
|
503
542
|
new_tag (Tag): Replacement tag
|
|
504
543
|
"""
|
|
505
544
|
old_tag.replace_with(new_tag)
|
|
506
|
-
|
|
545
|
+
|
|
507
546
|
def encode(self, encoding='utf-8') -> bytes:
|
|
508
547
|
"""
|
|
509
548
|
Encode the document to a specific encoding.
|
|
510
|
-
|
|
549
|
+
|
|
511
550
|
Args:
|
|
512
551
|
encoding (str, optional): Encoding to use
|
|
513
|
-
|
|
552
|
+
|
|
514
553
|
Returns:
|
|
515
554
|
bytes: Encoded document
|
|
516
555
|
"""
|
|
517
556
|
return str(self._soup).encode(encoding)
|
|
518
|
-
|
|
557
|
+
|
|
519
558
|
def decode(self, encoding='utf-8') -> str:
|
|
520
559
|
"""
|
|
521
560
|
Decode the document from a specific encoding.
|
|
522
|
-
|
|
561
|
+
|
|
523
562
|
Args:
|
|
524
563
|
encoding (str, optional): Encoding to use
|
|
525
|
-
|
|
564
|
+
|
|
526
565
|
Returns:
|
|
527
566
|
str: Decoded document
|
|
528
567
|
"""
|
|
529
568
|
return str(self._soup)
|
|
530
|
-
|
|
569
|
+
|
|
531
570
|
def __str__(self) -> str:
|
|
532
571
|
"""
|
|
533
572
|
String representation of the parsed document.
|
|
534
|
-
|
|
573
|
+
|
|
535
574
|
Returns:
|
|
536
575
|
str: HTML content
|
|
537
576
|
"""
|
|
538
577
|
return str(self._soup)
|
|
539
|
-
|
|
578
|
+
|
|
540
579
|
def __repr__(self) -> str:
|
|
541
580
|
"""
|
|
542
581
|
Detailed representation of the Scout object.
|
|
543
|
-
|
|
582
|
+
|
|
544
583
|
Returns:
|
|
545
584
|
str: Scout object description
|
|
546
585
|
"""
|
|
@@ -549,20 +588,20 @@ class Scout:
|
|
|
549
588
|
def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
|
|
550
589
|
"""
|
|
551
590
|
Preprocess markup before parsing.
|
|
552
|
-
|
|
591
|
+
|
|
553
592
|
Args:
|
|
554
593
|
markup (str): Input markup
|
|
555
594
|
encoding (str, optional): Encoding to use
|
|
556
|
-
|
|
595
|
+
|
|
557
596
|
Returns:
|
|
558
597
|
str: Preprocessed markup
|
|
559
598
|
"""
|
|
560
599
|
# Decode markup
|
|
561
600
|
decoded_markup = decode_markup(markup, encoding)
|
|
562
|
-
|
|
601
|
+
|
|
563
602
|
# Basic HTML cleaning
|
|
564
603
|
# Remove comments, normalize whitespace, etc.
|
|
565
604
|
decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
|
|
566
605
|
decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
|
|
567
|
-
|
|
606
|
+
|
|
568
607
|
return decoded_markup
|