webscout 8.2.7__py3-none-any.whl → 8.2.9__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webscout/AIauto.py +33 -15
- webscout/AIbase.py +96 -37
- webscout/AIutel.py +703 -250
- webscout/Bard.py +441 -323
- webscout/Extra/Act.md +309 -0
- webscout/Extra/GitToolkit/__init__.py +10 -0
- webscout/Extra/GitToolkit/gitapi/README.md +110 -0
- webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
- webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
- webscout/Extra/GitToolkit/gitapi/user.py +96 -0
- webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
- webscout/Extra/YTToolkit/README.md +375 -0
- webscout/Extra/YTToolkit/YTdownloader.py +957 -0
- webscout/Extra/YTToolkit/__init__.py +3 -0
- webscout/Extra/YTToolkit/transcriber.py +476 -0
- webscout/Extra/YTToolkit/ytapi/README.md +44 -0
- webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
- webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
- webscout/Extra/YTToolkit/ytapi/https.py +88 -0
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
- webscout/Extra/YTToolkit/ytapi/query.py +40 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
- webscout/Extra/YTToolkit/ytapi/video.py +232 -0
- webscout/Extra/__init__.py +7 -0
- webscout/Extra/autocoder/__init__.py +9 -0
- webscout/Extra/autocoder/autocoder.py +1105 -0
- webscout/Extra/autocoder/autocoder_utiles.py +332 -0
- webscout/Extra/gguf.md +430 -0
- webscout/Extra/gguf.py +684 -0
- webscout/Extra/tempmail/README.md +488 -0
- webscout/Extra/tempmail/__init__.py +28 -0
- webscout/Extra/tempmail/async_utils.py +141 -0
- webscout/Extra/tempmail/base.py +161 -0
- webscout/Extra/tempmail/cli.py +187 -0
- webscout/Extra/tempmail/emailnator.py +84 -0
- webscout/Extra/tempmail/mail_tm.py +361 -0
- webscout/Extra/tempmail/temp_mail_io.py +292 -0
- webscout/Extra/weather.md +281 -0
- webscout/Extra/weather.py +194 -0
- webscout/Extra/weather_ascii.py +76 -0
- webscout/Litlogger/README.md +10 -0
- webscout/Litlogger/__init__.py +15 -0
- webscout/Litlogger/formats.py +4 -0
- webscout/Litlogger/handlers.py +103 -0
- webscout/Litlogger/levels.py +13 -0
- webscout/Litlogger/logger.py +92 -0
- webscout/Provider/AI21.py +177 -0
- webscout/Provider/AISEARCH/DeepFind.py +254 -0
- webscout/Provider/AISEARCH/Perplexity.py +333 -0
- webscout/Provider/AISEARCH/README.md +279 -0
- webscout/Provider/AISEARCH/__init__.py +9 -0
- webscout/Provider/AISEARCH/felo_search.py +202 -0
- webscout/Provider/AISEARCH/genspark_search.py +324 -0
- webscout/Provider/AISEARCH/hika_search.py +186 -0
- webscout/Provider/AISEARCH/iask_search.py +410 -0
- webscout/Provider/AISEARCH/monica_search.py +220 -0
- webscout/Provider/AISEARCH/scira_search.py +298 -0
- webscout/Provider/AISEARCH/webpilotai_search.py +255 -0
- webscout/Provider/Aitopia.py +316 -0
- webscout/Provider/AllenAI.py +440 -0
- webscout/Provider/Andi.py +228 -0
- webscout/Provider/Blackboxai.py +791 -0
- webscout/Provider/ChatGPTClone.py +237 -0
- webscout/Provider/ChatGPTGratis.py +194 -0
- webscout/Provider/ChatSandbox.py +342 -0
- webscout/Provider/Cloudflare.py +324 -0
- webscout/Provider/Cohere.py +208 -0
- webscout/Provider/Deepinfra.py +340 -0
- webscout/Provider/ExaAI.py +261 -0
- webscout/Provider/ExaChat.py +358 -0
- webscout/Provider/Flowith.py +217 -0
- webscout/Provider/FreeGemini.py +250 -0
- webscout/Provider/Gemini.py +169 -0
- webscout/Provider/GithubChat.py +369 -0
- webscout/Provider/GizAI.py +295 -0
- webscout/Provider/Glider.py +225 -0
- webscout/Provider/Groq.py +801 -0
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +206 -0
- webscout/Provider/HeckAI.py +375 -0
- webscout/Provider/HuggingFaceChat.py +469 -0
- webscout/Provider/Hunyuan.py +283 -0
- webscout/Provider/Jadve.py +291 -0
- webscout/Provider/Koboldai.py +384 -0
- webscout/Provider/LambdaChat.py +411 -0
- webscout/Provider/Llama3.py +259 -0
- webscout/Provider/MCPCore.py +315 -0
- webscout/Provider/Marcus.py +198 -0
- webscout/Provider/Nemotron.py +218 -0
- webscout/Provider/Netwrck.py +270 -0
- webscout/Provider/OLLAMA.py +396 -0
- webscout/Provider/OPENAI/BLACKBOXAI.py +766 -0
- webscout/Provider/OPENAI/Cloudflare.py +378 -0
- webscout/Provider/OPENAI/FreeGemini.py +283 -0
- webscout/Provider/OPENAI/NEMOTRON.py +232 -0
- webscout/Provider/OPENAI/Qwen3.py +283 -0
- webscout/Provider/OPENAI/README.md +952 -0
- webscout/Provider/OPENAI/TwoAI.py +357 -0
- webscout/Provider/OPENAI/__init__.py +40 -0
- webscout/Provider/OPENAI/ai4chat.py +293 -0
- webscout/Provider/OPENAI/api.py +969 -0
- webscout/Provider/OPENAI/base.py +249 -0
- webscout/Provider/OPENAI/c4ai.py +373 -0
- webscout/Provider/OPENAI/chatgpt.py +556 -0
- webscout/Provider/OPENAI/chatgptclone.py +494 -0
- webscout/Provider/OPENAI/chatsandbox.py +173 -0
- webscout/Provider/OPENAI/copilot.py +242 -0
- webscout/Provider/OPENAI/deepinfra.py +322 -0
- webscout/Provider/OPENAI/e2b.py +1414 -0
- webscout/Provider/OPENAI/exaai.py +417 -0
- webscout/Provider/OPENAI/exachat.py +444 -0
- webscout/Provider/OPENAI/flowith.py +162 -0
- webscout/Provider/OPENAI/freeaichat.py +359 -0
- webscout/Provider/OPENAI/glider.py +326 -0
- webscout/Provider/OPENAI/groq.py +364 -0
- webscout/Provider/OPENAI/heckai.py +308 -0
- webscout/Provider/OPENAI/llmchatco.py +335 -0
- webscout/Provider/OPENAI/mcpcore.py +389 -0
- webscout/Provider/OPENAI/multichat.py +376 -0
- webscout/Provider/OPENAI/netwrck.py +357 -0
- webscout/Provider/OPENAI/oivscode.py +287 -0
- webscout/Provider/OPENAI/opkfc.py +496 -0
- webscout/Provider/OPENAI/pydantic_imports.py +172 -0
- webscout/Provider/OPENAI/scirachat.py +477 -0
- webscout/Provider/OPENAI/sonus.py +304 -0
- webscout/Provider/OPENAI/standardinput.py +433 -0
- webscout/Provider/OPENAI/textpollinations.py +339 -0
- webscout/Provider/OPENAI/toolbaz.py +413 -0
- webscout/Provider/OPENAI/typefully.py +355 -0
- webscout/Provider/OPENAI/typegpt.py +364 -0
- webscout/Provider/OPENAI/uncovrAI.py +463 -0
- webscout/Provider/OPENAI/utils.py +318 -0
- webscout/Provider/OPENAI/venice.py +431 -0
- webscout/Provider/OPENAI/wisecat.py +387 -0
- webscout/Provider/OPENAI/writecream.py +163 -0
- webscout/Provider/OPENAI/x0gpt.py +365 -0
- webscout/Provider/OPENAI/yep.py +382 -0
- webscout/Provider/OpenGPT.py +209 -0
- webscout/Provider/Openai.py +496 -0
- webscout/Provider/PI.py +429 -0
- webscout/Provider/Perplexitylabs.py +415 -0
- webscout/Provider/QwenLM.py +254 -0
- webscout/Provider/Reka.py +214 -0
- webscout/Provider/StandardInput.py +290 -0
- webscout/Provider/TTI/README.md +82 -0
- webscout/Provider/TTI/__init__.py +7 -0
- webscout/Provider/TTI/aiarta.py +365 -0
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/base.py +64 -0
- webscout/Provider/TTI/fastflux.py +200 -0
- webscout/Provider/TTI/magicstudio.py +201 -0
- webscout/Provider/TTI/piclumen.py +203 -0
- webscout/Provider/TTI/pixelmuse.py +225 -0
- webscout/Provider/TTI/pollinations.py +221 -0
- webscout/Provider/TTI/utils.py +11 -0
- webscout/Provider/TTS/README.md +192 -0
- webscout/Provider/TTS/__init__.py +10 -0
- webscout/Provider/TTS/base.py +159 -0
- webscout/Provider/TTS/deepgram.py +156 -0
- webscout/Provider/TTS/elevenlabs.py +111 -0
- webscout/Provider/TTS/gesserit.py +128 -0
- webscout/Provider/TTS/murfai.py +113 -0
- webscout/Provider/TTS/openai_fm.py +129 -0
- webscout/Provider/TTS/parler.py +111 -0
- webscout/Provider/TTS/speechma.py +580 -0
- webscout/Provider/TTS/sthir.py +94 -0
- webscout/Provider/TTS/streamElements.py +333 -0
- webscout/Provider/TTS/utils.py +280 -0
- webscout/Provider/TeachAnything.py +229 -0
- webscout/Provider/TextPollinationsAI.py +308 -0
- webscout/Provider/TwoAI.py +475 -0
- webscout/Provider/TypliAI.py +305 -0
- webscout/Provider/UNFINISHED/ChatHub.py +209 -0
- webscout/Provider/UNFINISHED/Youchat.py +330 -0
- webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
- webscout/Provider/UNFINISHED/puterjs.py +635 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
- webscout/Provider/Venice.py +258 -0
- webscout/Provider/VercelAI.py +253 -0
- webscout/Provider/WiseCat.py +233 -0
- webscout/Provider/WrDoChat.py +370 -0
- webscout/Provider/Writecream.py +246 -0
- webscout/Provider/WritingMate.py +269 -0
- webscout/Provider/__init__.py +174 -0
- webscout/Provider/ai4chat.py +174 -0
- webscout/Provider/akashgpt.py +335 -0
- webscout/Provider/asksteve.py +220 -0
- webscout/Provider/cerebras.py +290 -0
- webscout/Provider/chatglm.py +215 -0
- webscout/Provider/cleeai.py +213 -0
- webscout/Provider/copilot.py +425 -0
- webscout/Provider/elmo.py +283 -0
- webscout/Provider/freeaichat.py +285 -0
- webscout/Provider/geminiapi.py +208 -0
- webscout/Provider/granite.py +235 -0
- webscout/Provider/hermes.py +266 -0
- webscout/Provider/julius.py +223 -0
- webscout/Provider/koala.py +170 -0
- webscout/Provider/learnfastai.py +325 -0
- webscout/Provider/llama3mitril.py +215 -0
- webscout/Provider/llmchat.py +258 -0
- webscout/Provider/llmchatco.py +306 -0
- webscout/Provider/lmarena.py +198 -0
- webscout/Provider/meta.py +801 -0
- webscout/Provider/multichat.py +364 -0
- webscout/Provider/oivscode.py +309 -0
- webscout/Provider/samurai.py +224 -0
- webscout/Provider/scira_chat.py +299 -0
- webscout/Provider/scnet.py +243 -0
- webscout/Provider/searchchat.py +292 -0
- webscout/Provider/sonus.py +258 -0
- webscout/Provider/talkai.py +194 -0
- webscout/Provider/toolbaz.py +353 -0
- webscout/Provider/turboseek.py +266 -0
- webscout/Provider/typefully.py +202 -0
- webscout/Provider/typegpt.py +289 -0
- webscout/Provider/uncovr.py +368 -0
- webscout/Provider/x0gpt.py +299 -0
- webscout/Provider/yep.py +389 -0
- webscout/__init__.py +4 -2
- webscout/cli.py +3 -28
- webscout/client.py +70 -0
- webscout/conversation.py +35 -35
- webscout/litagent/Readme.md +276 -0
- webscout/litagent/__init__.py +29 -0
- webscout/litagent/agent.py +455 -0
- webscout/litagent/constants.py +60 -0
- webscout/litprinter/__init__.py +59 -0
- webscout/optimizers.py +419 -419
- webscout/scout/README.md +404 -0
- webscout/scout/__init__.py +8 -0
- webscout/scout/core/__init__.py +7 -0
- webscout/scout/core/crawler.py +210 -0
- webscout/scout/core/scout.py +607 -0
- webscout/scout/core/search_result.py +96 -0
- webscout/scout/core/text_analyzer.py +63 -0
- webscout/scout/core/text_utils.py +277 -0
- webscout/scout/core/web_analyzer.py +52 -0
- webscout/scout/element.py +478 -0
- webscout/scout/parsers/__init__.py +69 -0
- webscout/scout/parsers/html5lib_parser.py +172 -0
- webscout/scout/parsers/html_parser.py +236 -0
- webscout/scout/parsers/lxml_parser.py +178 -0
- webscout/scout/utils.py +37 -0
- webscout/swiftcli/Readme.md +323 -0
- webscout/swiftcli/__init__.py +95 -0
- webscout/swiftcli/core/__init__.py +7 -0
- webscout/swiftcli/core/cli.py +297 -0
- webscout/swiftcli/core/context.py +104 -0
- webscout/swiftcli/core/group.py +241 -0
- webscout/swiftcli/decorators/__init__.py +28 -0
- webscout/swiftcli/decorators/command.py +221 -0
- webscout/swiftcli/decorators/options.py +220 -0
- webscout/swiftcli/decorators/output.py +252 -0
- webscout/swiftcli/exceptions.py +21 -0
- webscout/swiftcli/plugins/__init__.py +9 -0
- webscout/swiftcli/plugins/base.py +135 -0
- webscout/swiftcli/plugins/manager.py +269 -0
- webscout/swiftcli/utils/__init__.py +59 -0
- webscout/swiftcli/utils/formatting.py +252 -0
- webscout/swiftcli/utils/parsing.py +267 -0
- webscout/version.py +1 -1
- webscout/webscout_search.py +2 -182
- webscout/webscout_search_async.py +1 -179
- webscout/zeroart/README.md +89 -0
- webscout/zeroart/__init__.py +135 -0
- webscout/zeroart/base.py +66 -0
- webscout/zeroart/effects.py +101 -0
- webscout/zeroart/fonts.py +1239 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/METADATA +262 -83
- webscout-8.2.9.dist-info/RECORD +289 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/WHEEL +1 -1
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/entry_points.txt +1 -0
- webscout-8.2.7.dist-info/RECORD +0 -26
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.7.dist-info → webscout-8.2.9.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,478 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scout Element Module - Advanced HTML Element Representation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NavigableString(str):
    """
    Text node that remembers which element of the document tree holds it.

    Shaped after BeautifulSoup's NavigableString so code written against
    that API keeps working.
    """

    def __new__(cls, text: str):
        """
        Build the immutable string value backing this node.

        Args:
            text (str): String content
        """
        return super().__new__(cls, text)

    def __init__(self, text: str):
        """
        Start the node detached from any tree.

        Args:
            text (str): String content (already consumed by __new__)
        """
        # Filled in by whichever container adopts this node.
        self.parent = None

    def __repr__(self):
        """Return the plain-string repr wrapped in the class name."""
        return f"NavigableString({str.__repr__(self)})"

    def __add__(self, other):
        """
        Concatenate this node with another value.

        Args:
            other: Value to append; it is converted with str() first

        Returns:
            str: A plain string (not a NavigableString)
        """
        left = str(self)
        right = str(other)
        return "".join((left, right))

    def strip(self, chars=None):
        """
        Strip whitespace or the given characters from both ends.

        Args:
            chars (str, optional): Characters to strip

        Returns:
            NavigableString: New, parentless node holding the stripped text
        """
        trimmed = str.strip(self, chars)
        return NavigableString(trimmed)
|
|
59
|
+
|
|
60
|
+
class Tag:
    """
    Represents an HTML tag with traversal, search and tree-editing helpers.

    The public API is modelled on BeautifulSoup's Tag class.
    """

    def __init__(self, name: str, attrs: Dict[str, str] = None):
        """
        Initialize a Tag with name and attributes.

        Args:
            name (str): Tag name
            attrs (dict, optional): Tag attributes; a falsy value gives the
                tag a fresh empty dict
        """
        self.name = name
        self.attrs = attrs or {}
        self.contents = []  # Child nodes (Tag or string objects) in document order
        self.parent = None  # Maintained by append()/insert()/replace_with()
        self.string = None  # For single string content

    def __str__(self):
        """String representation of the tag (its decoded contents)."""
        return self.decode_contents()

    def __repr__(self):
        """Detailed representation of the tag."""
        return f"<{self.name} {self.attrs}>"

    def __call__(self, *args, **kwargs):
        """
        Allows calling find_all directly on the tag.
        Mimics BeautifulSoup's behavior.
        """
        return self.find_all(*args, **kwargs)

    def __contains__(self, item):
        """
        Check if an item is in the tag's contents.

        Args:
            item: Item to search for

        Returns:
            bool: True if item is in contents, False otherwise
        """
        return item in self.contents

    def __getitem__(self, key):
        """
        Get an attribute value using dictionary-like access.

        Args:
            key (str): Attribute name

        Returns:
            Any: Attribute value

        Raises:
            KeyError: If the attribute is not present
        """
        return self.attrs[key]

    def __iter__(self):
        """
        Iterate through tag's contents.

        Returns:
            Iterator: Contents of the tag
        """
        return iter(self.contents)

    def __eq__(self, other):
        """
        Compare tags based on name, attributes and string form.

        Args:
            other (Tag): Tag to compare

        Returns:
            bool: True if tags are equivalent
        """
        if not isinstance(other, Tag):
            return False
        return (
            self.name == other.name and
            self.attrs == other.attrs and
            str(self) == str(other)
        )

    def __hash__(self):
        """
        Generate a hash for the tag.

        Attribute values are frozen first so that list-valued attributes
        (for example a list of classes) do not raise TypeError.

        Returns:
            int: Hash value
        """
        frozen_attrs = frozenset(
            (key, self._hashable(value)) for key, value in self.attrs.items()
        )
        return hash((self.name, frozen_attrs, str(self)))

    @staticmethod
    def _hashable(value):
        """Return a hashable stand-in for an attribute value."""
        if isinstance(value, (list, tuple)):
            return tuple(Tag._hashable(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(Tag._hashable(item) for item in value)
        if isinstance(value, dict):
            return frozenset((k, Tag._hashable(v)) for k, v in value.items())
        return value

    @staticmethod
    def _class_list(tag) -> List[str]:
        """Return the tag's classes as a list, whether stored as a list or a string."""
        value = tag.get('class')
        if value is None:
            return []
        if isinstance(value, str):
            return value.split()
        return list(value)

    @staticmethod
    def _class_matches(tag, wanted) -> bool:
        """Tell whether the tag's classes satisfy a `class` filter value."""
        classes = Tag._class_list(tag)
        if wanted is True:
            return tag.get('class') is not None
        if isinstance(wanted, str):
            if wanted in classes:
                return True
            # A multi-class string matches only the exact same class sequence.
            parts = wanted.split()
            return len(parts) > 1 and parts == classes
        if isinstance(wanted, re.Pattern):
            return any(wanted.search(cls) for cls in classes)
        if isinstance(wanted, (list, tuple, set, frozenset)):
            return all(cls in classes for cls in wanted)
        # Other filter types are not interpreted and do not exclude the tag.
        return True

    def _iter_descendants(self):
        """Yield every descendant Tag in document order (self excluded)."""
        # Explicit stack instead of recursion so deep trees cannot hit the
        # interpreter recursion limit.
        stack = [child for child in reversed(self.contents) if isinstance(child, Tag)]
        while stack:
            node = stack.pop()
            yield node
            stack.extend(
                child for child in reversed(node.contents) if isinstance(child, Tag)
            )

    def find(self, name=None, attrs=None, recursive=True, text=None, **kwargs) -> Optional['Tag']:
        """
        Find the first matching element.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str, optional): Text content to match
            **kwargs: Extra attribute filters, see find_all()

        Returns:
            Tag or None: First matching element
        """
        results = self.find_all(name, attrs, recursive, text, limit=1, **kwargs)
        return results[0] if results else None

    def find_all(self, name=None, attrs=None, recursive=True, text=None, limit=None, **kwargs) -> List['Tag']:
        """
        Find all matching elements.

        This tag itself is tested first; when ``recursive`` is true all of
        its descendant tags are tested as well, in document order.

        Args:
            name (str, re.Pattern or collection of str, optional): Tag name
                filter; strings are compared case-insensitively
            attrs (dict or str, optional): Attributes to match; a plain
                string is treated as a class name
            recursive (bool, optional): Also search descendants
            text (str or re.Pattern, optional): Text content to match
            limit (int, optional): Maximum number of results
            **kwargs: Extra attribute filters (``class_`` stands for
                ``class``); ``string`` is accepted as an alias of ``text``

        Returns:
            List[Tag]: List of matching elements
        """
        # Build a private filter dict so neither a shared default nor the
        # caller's dict is ever mutated.
        if isinstance(attrs, str):
            wanted = {'class': attrs}
        else:
            wanted = dict(attrs) if attrs else {}

        string_alias = kwargs.pop('string', None)
        if text is None:
            text = string_alias
        for key, value in kwargs.items():
            wanted['class' if key == 'class_' else key] = value

        name_options = None
        if name and isinstance(name, (list, tuple, set, frozenset)):
            name_options = {str(option).lower() for option in name}

        def _match(tag):
            # Check tag name with case-insensitive and regex support
            if name:
                if isinstance(name, str):
                    if tag.name.lower() != name.lower():
                        return False
                elif isinstance(name, re.Pattern):
                    if not name.search(tag.name):
                        return False
                elif name_options is not None:
                    if tag.name.lower() not in name_options:
                        return False

            # Check attributes
            for key, value in wanted.items():
                if key == 'class':
                    if not self._class_matches(tag, value):
                        return False
                    continue
                tag_attr = tag.attrs.get(key)
                if value is True:
                    # True means "attribute must be present"
                    if tag_attr is None:
                        return False
                elif isinstance(value, re.Pattern):
                    if tag_attr is None or not value.search(str(tag_attr)):
                        return False
                elif tag_attr != value:
                    return False

            # Check text content
            if text:
                tag_text = tag.get_text(strip=True)
                if isinstance(text, str) and text.lower() not in tag_text.lower():
                    return False
                elif isinstance(text, re.Pattern) and not text.search(tag_text):
                    return False

            return True

        def _candidates():
            yield self
            if recursive:
                yield from self._iter_descendants()

        results = []
        for candidate in _candidates():
            if _match(candidate):
                results.append(candidate)
                # Stop the whole walk, not just the current subtree.
                if limit and len(results) == limit:
                    break
        return results

    @staticmethod
    def _match_selector(tag, selector_part) -> bool:
        """Test one whitespace-free selector token against a tag."""
        if selector_part.startswith('.'):
            # Class selector
            return selector_part[1:] in Tag._class_list(tag)
        if selector_part.startswith('#'):
            # ID selector
            return tag.get('id') == selector_part[1:]
        if '[' in selector_part and ']' in selector_part:
            # Attribute selector; the tag name in front is optional
            attr_match = re.match(r'(\w*)\[([^=]+)(?:=(.+))?\]', selector_part)
            if not attr_match:
                return False
            tag_name, attr, value = attr_match.groups()
            if tag_name and tag.name != tag_name:
                return False
            if value:
                return tag.get(attr) == value.strip("'\"")
            return attr in tag.attrs
        # Tag selector
        return tag.name == selector_part

    def select(self, selector: str) -> List['Tag']:
        """
        Select elements using a simplified CSS selector.

        Supported tokens are ``tag``, ``.class``, ``#id``, ``[attr]``,
        ``[attr=value]`` and ``tag[attr=value]``. Tokens separated by
        whitespace are descendant steps; a standalone ``>`` makes the next
        token match direct children only.

        Args:
            selector (str): CSS selector string

        Returns:
            List[Tag]: Matching elements in document order
        """
        parts = re.split(r'\s+', selector.strip())
        current = [self]
        child_only = False

        for part in parts:
            if part == '>':
                child_only = True
                continue

            matched = []
            seen = set()  # ids of tags already collected for this step
            for context in current:
                if child_only:
                    pool = [c for c in context.contents if isinstance(c, Tag)]
                else:
                    pool = context._iter_descendants()
                for candidate in pool:
                    if id(candidate) in seen:
                        continue
                    if self._match_selector(candidate, part):
                        seen.add(id(candidate))
                        matched.append(candidate)

            if not matched:
                return []
            current = matched
            child_only = False

        return current

    def select_one(self, selector: str) -> Optional['Tag']:
        """
        Select the first element matching the CSS selector.

        Args:
            selector (str): CSS selector string

        Returns:
            Tag or None: First matching element
        """
        results = self.select(selector)
        return results[0] if results else None

    def get_text(self, separator=' ', strip=False, types=None) -> str:
        """
        Extract text from the tag and its descendants.

        Args:
            separator (str, optional): Text separator
            strip (bool, optional): Strip whitespace
            types (list, optional): Exact types of direct children to
                include; None includes everything

        Returns:
            str: Extracted text
        """
        texts = []
        for content in self.contents:
            # Support filtering by content type
            if types is None or type(content) in types:
                if isinstance(content, str):
                    # Covers NavigableString and plain str children alike
                    texts.append(str(content))
                elif isinstance(content, Tag):
                    texts.append(content.get_text(separator, strip))

        text = separator.join(texts)
        text = re.sub(r'\n\n+', '\n', text)  # Replace multiple newlines with single newlines
        return text.strip() if strip else text

    def find_text(self, pattern: Union[str, re.Pattern], **kwargs) -> Optional[str]:
        """
        Find the first text matching a pattern.

        Args:
            pattern (str or re.Pattern): Pattern to match
            **kwargs: Additional arguments for get_text()

        Returns:
            str or None: The pattern itself (for a str pattern found in the
                text) or the matched substring (for a regex); None otherwise
        """
        text = self.get_text(**kwargs)

        if isinstance(pattern, str):
            return pattern if pattern in text else None
        elif isinstance(pattern, re.Pattern):
            match = pattern.search(text)
            return match.group(0) if match else None
        return None

    def replace_text(self, old: Union[str, re.Pattern], new: str, **kwargs) -> str:
        """
        Return the tag's text with matches of a pattern replaced.

        The tree itself is not modified.

        Args:
            old (str or re.Pattern): Pattern to replace
            new (str): Replacement text
            **kwargs: Additional arguments for get_text()

        Returns:
            str: Modified text (None if ``old`` is neither str nor regex)
        """
        text = self.get_text(**kwargs)

        if isinstance(old, str):
            return text.replace(old, new)
        elif isinstance(old, re.Pattern):
            return old.sub(new, text)
        return None

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get an attribute value.

        Args:
            key (str): Attribute name
            default (Any, optional): Default value if attribute not found

        Returns:
            Any: Attribute value or default
        """
        return self.attrs.get(key, default)

    def _position_in_parent(self) -> Optional[int]:
        """Return this tag's index in its parent's contents, by identity."""
        if self.parent is None:
            return None
        # Identity, not ==: equal-looking siblings must not be confused
        # with this node.
        for index, sibling in enumerate(self.parent.contents):
            if sibling is self:
                return index
        return None

    def decompose(self) -> None:
        """Remove the tag and its contents from the document."""
        index = self._position_in_parent()
        if index is not None:
            del self.parent.contents[index]
        self.parent = None

    def extract(self) -> 'Tag':
        """
        Remove the tag from the document and return it.

        Returns:
            Tag: Extracted tag
        """
        self.decompose()
        return self

    def clear(self) -> None:
        """Remove all contents of the tag."""
        self.contents.clear()

    def _adopt(self, new_child):
        """Wrap plain strings as NavigableString and set the child's parent."""
        if isinstance(new_child, str) and not isinstance(new_child, NavigableString):
            new_child = NavigableString(new_child)
        new_child.parent = self
        return new_child

    def append(self, new_child: Union['Tag', 'NavigableString', str]) -> None:
        """Append a new child to this tag."""
        self.contents.append(self._adopt(new_child))

    def insert(self, index: int, new_child: Union['Tag', 'NavigableString', str]) -> None:
        """Insert a new child at the given index."""
        self.contents.insert(index, self._adopt(new_child))

    def replace_with(self, new_tag: 'Tag') -> None:
        """
        Replace this tag with another node.

        Does nothing when this tag has no parent.

        Args:
            new_tag (Tag): Node to put in this tag's place; plain strings
                are wrapped as NavigableString

        Raises:
            ValueError: If this tag is not among its parent's contents
        """
        parent = self.parent
        if parent is None:
            return
        index = self._position_in_parent()
        if index is None:
            raise ValueError("tag is not in its parent's contents")
        parent.contents[index] = parent._adopt(new_tag)
        self.parent = None

    def decode_contents(self, eventual_encoding='utf-8') -> str:
        """
        Decode the contents of the tag to a string.

        Args:
            eventual_encoding (str, optional): Accepted for API
                compatibility; not used

        Returns:
            str: Concatenated str() of every child
        """
        return ''.join(str(content) for content in self.contents)

    def prettify(self, formatter='minimal') -> str:
        """
        Return an indented representation of the tag.

        Args:
            formatter (str, optional): Accepted for API compatibility;
                not used

        Returns:
            str: Prettified tag representation
        """
        def _render(tag, indent=0):
            pad = ' ' * indent
            attr_text = ''.join(f' {k}="{v}"' for k, v in tag.attrs.items())
            pieces = [f'{pad}<{tag.name}{attr_text}>\n']

            for content in tag.contents:
                if isinstance(content, Tag):
                    pieces.append(_render(content, indent + 2))
                else:
                    pieces.append(' ' * (indent + 2) + str(content) + '\n')

            pieces.append(f'{pad}</{tag.name}>\n')
            return ''.join(pieces)

        return _render(self)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scout Parsers - Unified Parsing Interfaces
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Type, Any
|
|
6
|
+
|
|
7
|
+
from .html_parser import HTMLParser
|
|
8
|
+
from .lxml_parser import LXMLParser
|
|
9
|
+
from .html5lib_parser import HTML5Parser
|
|
10
|
+
|
|
11
|
+
class ParserRegistry:
    """
    Name-to-class lookup table for the HTML parsers Scout can use.

    All state lives on the class, so the registry is shared process-wide.
    """

    # Parser classes keyed by the name callers pass to get_parser().
    _PARSERS: Dict[str, Type[Any]] = {
        'html.parser': HTMLParser,
        'lxml': LXMLParser,
        'html5lib': HTML5Parser
    }

    @classmethod
    def get_parser(cls, parser_name: str = 'html.parser') -> Any:
        """
        Instantiate the parser registered under a name.

        Args:
            parser_name (str): Name of the parser to retrieve

        Returns:
            A new instance of the registered parser class, built with no
            arguments

        Raises:
            ValueError: If no parser is registered under that name
        """
        if parser_name in cls._PARSERS:
            parser_class = cls._PARSERS[parser_name]
            return parser_class()

        raise ValueError(f"Parser '{parser_name}' not found. Available parsers: {list(cls._PARSERS.keys())}")

    @classmethod
    def register_parser(cls, name: str, parser_class: Type[Any]):
        """
        Add a parser class to the registry, replacing any existing entry.

        Args:
            name (str): Name of the parser
            parser_class (Type): Parser class to register
        """
        cls._PARSERS.update({name: parser_class})

    @classmethod
    def list_parsers(cls) -> Dict[str, Type[Any]]:
        """
        Return the registered parsers.

        Returns:
            A shallow copy of the name-to-class mapping, so changes to it do
            not affect the registry
        """
        return dict(cls._PARSERS)
|
|
62
|
+
|
|
63
|
+
# Expose key classes and functions
|
|
64
|
+
# Names re-exported by `from <package> import *`.
__all__ = [
    'HTMLParser',
    'LXMLParser',
    'HTML5Parser',
    'ParserRegistry',
]
|