webscout 8.2.7__py3-none-any.whl → 8.2.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic. Click here for more details.
- webscout/AIauto.py +1 -1
- webscout/AIutel.py +298 -249
- webscout/Extra/Act.md +309 -0
- webscout/Extra/GitToolkit/__init__.py +10 -0
- webscout/Extra/GitToolkit/gitapi/README.md +110 -0
- webscout/Extra/GitToolkit/gitapi/__init__.py +12 -0
- webscout/Extra/GitToolkit/gitapi/repository.py +195 -0
- webscout/Extra/GitToolkit/gitapi/user.py +96 -0
- webscout/Extra/GitToolkit/gitapi/utils.py +62 -0
- webscout/Extra/YTToolkit/README.md +375 -0
- webscout/Extra/YTToolkit/YTdownloader.py +957 -0
- webscout/Extra/YTToolkit/__init__.py +3 -0
- webscout/Extra/YTToolkit/transcriber.py +476 -0
- webscout/Extra/YTToolkit/ytapi/README.md +44 -0
- webscout/Extra/YTToolkit/ytapi/__init__.py +6 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -0
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -0
- webscout/Extra/YTToolkit/ytapi/extras.py +118 -0
- webscout/Extra/YTToolkit/ytapi/https.py +88 -0
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -0
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -0
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -0
- webscout/Extra/YTToolkit/ytapi/query.py +40 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +63 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -0
- webscout/Extra/YTToolkit/ytapi/video.py +232 -0
- webscout/Extra/__init__.py +7 -0
- webscout/Extra/autocoder/__init__.py +9 -0
- webscout/Extra/autocoder/autocoder.py +1105 -0
- webscout/Extra/autocoder/autocoder_utiles.py +332 -0
- webscout/Extra/gguf.md +430 -0
- webscout/Extra/gguf.py +684 -0
- webscout/Extra/tempmail/README.md +488 -0
- webscout/Extra/tempmail/__init__.py +28 -0
- webscout/Extra/tempmail/async_utils.py +141 -0
- webscout/Extra/tempmail/base.py +161 -0
- webscout/Extra/tempmail/cli.py +187 -0
- webscout/Extra/tempmail/emailnator.py +84 -0
- webscout/Extra/tempmail/mail_tm.py +361 -0
- webscout/Extra/tempmail/temp_mail_io.py +292 -0
- webscout/Extra/weather.md +281 -0
- webscout/Extra/weather.py +194 -0
- webscout/Extra/weather_ascii.py +76 -0
- webscout/Litlogger/Readme.md +175 -0
- webscout/Litlogger/__init__.py +67 -0
- webscout/Litlogger/core/__init__.py +6 -0
- webscout/Litlogger/core/level.py +23 -0
- webscout/Litlogger/core/logger.py +165 -0
- webscout/Litlogger/handlers/__init__.py +12 -0
- webscout/Litlogger/handlers/console.py +33 -0
- webscout/Litlogger/handlers/file.py +143 -0
- webscout/Litlogger/handlers/network.py +173 -0
- webscout/Litlogger/styles/__init__.py +7 -0
- webscout/Litlogger/styles/colors.py +249 -0
- webscout/Litlogger/styles/formats.py +458 -0
- webscout/Litlogger/styles/text.py +87 -0
- webscout/Litlogger/utils/__init__.py +6 -0
- webscout/Litlogger/utils/detectors.py +153 -0
- webscout/Litlogger/utils/formatters.py +200 -0
- webscout/Provider/AI21.py +177 -0
- webscout/Provider/AISEARCH/DeepFind.py +254 -0
- webscout/Provider/AISEARCH/Perplexity.py +359 -0
- webscout/Provider/AISEARCH/README.md +279 -0
- webscout/Provider/AISEARCH/__init__.py +9 -0
- webscout/Provider/AISEARCH/felo_search.py +228 -0
- webscout/Provider/AISEARCH/genspark_search.py +350 -0
- webscout/Provider/AISEARCH/hika_search.py +198 -0
- webscout/Provider/AISEARCH/iask_search.py +436 -0
- webscout/Provider/AISEARCH/monica_search.py +246 -0
- webscout/Provider/AISEARCH/scira_search.py +324 -0
- webscout/Provider/AISEARCH/webpilotai_search.py +281 -0
- webscout/Provider/Aitopia.py +316 -0
- webscout/Provider/AllenAI.py +440 -0
- webscout/Provider/Andi.py +228 -0
- webscout/Provider/Blackboxai.py +673 -0
- webscout/Provider/ChatGPTClone.py +237 -0
- webscout/Provider/ChatGPTGratis.py +194 -0
- webscout/Provider/ChatSandbox.py +342 -0
- webscout/Provider/Cloudflare.py +324 -0
- webscout/Provider/Cohere.py +208 -0
- webscout/Provider/Deepinfra.py +340 -0
- webscout/Provider/ExaAI.py +261 -0
- webscout/Provider/ExaChat.py +358 -0
- webscout/Provider/Flowith.py +217 -0
- webscout/Provider/FreeGemini.py +250 -0
- webscout/Provider/Gemini.py +169 -0
- webscout/Provider/GithubChat.py +370 -0
- webscout/Provider/GizAI.py +295 -0
- webscout/Provider/Glider.py +225 -0
- webscout/Provider/Groq.py +801 -0
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +206 -0
- webscout/Provider/HeckAI.py +285 -0
- webscout/Provider/HuggingFaceChat.py +469 -0
- webscout/Provider/Hunyuan.py +283 -0
- webscout/Provider/Jadve.py +291 -0
- webscout/Provider/Koboldai.py +384 -0
- webscout/Provider/LambdaChat.py +411 -0
- webscout/Provider/Llama3.py +259 -0
- webscout/Provider/MCPCore.py +315 -0
- webscout/Provider/Marcus.py +198 -0
- webscout/Provider/Nemotron.py +218 -0
- webscout/Provider/Netwrck.py +270 -0
- webscout/Provider/OLLAMA.py +396 -0
- webscout/Provider/OPENAI/BLACKBOXAI.py +735 -0
- webscout/Provider/OPENAI/Cloudflare.py +378 -0
- webscout/Provider/OPENAI/FreeGemini.py +282 -0
- webscout/Provider/OPENAI/NEMOTRON.py +244 -0
- webscout/Provider/OPENAI/README.md +1253 -0
- webscout/Provider/OPENAI/__init__.py +36 -0
- webscout/Provider/OPENAI/ai4chat.py +293 -0
- webscout/Provider/OPENAI/api.py +810 -0
- webscout/Provider/OPENAI/base.py +249 -0
- webscout/Provider/OPENAI/c4ai.py +373 -0
- webscout/Provider/OPENAI/chatgpt.py +556 -0
- webscout/Provider/OPENAI/chatgptclone.py +488 -0
- webscout/Provider/OPENAI/chatsandbox.py +172 -0
- webscout/Provider/OPENAI/deepinfra.py +319 -0
- webscout/Provider/OPENAI/e2b.py +1356 -0
- webscout/Provider/OPENAI/exaai.py +411 -0
- webscout/Provider/OPENAI/exachat.py +443 -0
- webscout/Provider/OPENAI/flowith.py +162 -0
- webscout/Provider/OPENAI/freeaichat.py +359 -0
- webscout/Provider/OPENAI/glider.py +323 -0
- webscout/Provider/OPENAI/groq.py +361 -0
- webscout/Provider/OPENAI/heckai.py +307 -0
- webscout/Provider/OPENAI/llmchatco.py +335 -0
- webscout/Provider/OPENAI/mcpcore.py +383 -0
- webscout/Provider/OPENAI/multichat.py +376 -0
- webscout/Provider/OPENAI/netwrck.py +356 -0
- webscout/Provider/OPENAI/opkfc.py +496 -0
- webscout/Provider/OPENAI/scirachat.py +471 -0
- webscout/Provider/OPENAI/sonus.py +303 -0
- webscout/Provider/OPENAI/standardinput.py +433 -0
- webscout/Provider/OPENAI/textpollinations.py +339 -0
- webscout/Provider/OPENAI/toolbaz.py +413 -0
- webscout/Provider/OPENAI/typefully.py +355 -0
- webscout/Provider/OPENAI/typegpt.py +358 -0
- webscout/Provider/OPENAI/uncovrAI.py +462 -0
- webscout/Provider/OPENAI/utils.py +307 -0
- webscout/Provider/OPENAI/venice.py +425 -0
- webscout/Provider/OPENAI/wisecat.py +381 -0
- webscout/Provider/OPENAI/writecream.py +163 -0
- webscout/Provider/OPENAI/x0gpt.py +378 -0
- webscout/Provider/OPENAI/yep.py +356 -0
- webscout/Provider/OpenGPT.py +209 -0
- webscout/Provider/Openai.py +496 -0
- webscout/Provider/PI.py +429 -0
- webscout/Provider/Perplexitylabs.py +415 -0
- webscout/Provider/QwenLM.py +254 -0
- webscout/Provider/Reka.py +214 -0
- webscout/Provider/StandardInput.py +290 -0
- webscout/Provider/TTI/AiForce/README.md +159 -0
- webscout/Provider/TTI/AiForce/__init__.py +22 -0
- webscout/Provider/TTI/AiForce/async_aiforce.py +224 -0
- webscout/Provider/TTI/AiForce/sync_aiforce.py +245 -0
- webscout/Provider/TTI/FreeAIPlayground/README.md +99 -0
- webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -0
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +181 -0
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +180 -0
- webscout/Provider/TTI/ImgSys/README.md +174 -0
- webscout/Provider/TTI/ImgSys/__init__.py +23 -0
- webscout/Provider/TTI/ImgSys/async_imgsys.py +202 -0
- webscout/Provider/TTI/ImgSys/sync_imgsys.py +195 -0
- webscout/Provider/TTI/MagicStudio/README.md +101 -0
- webscout/Provider/TTI/MagicStudio/__init__.py +2 -0
- webscout/Provider/TTI/MagicStudio/async_magicstudio.py +111 -0
- webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +109 -0
- webscout/Provider/TTI/Nexra/README.md +155 -0
- webscout/Provider/TTI/Nexra/__init__.py +22 -0
- webscout/Provider/TTI/Nexra/async_nexra.py +286 -0
- webscout/Provider/TTI/Nexra/sync_nexra.py +258 -0
- webscout/Provider/TTI/PollinationsAI/README.md +146 -0
- webscout/Provider/TTI/PollinationsAI/__init__.py +23 -0
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +311 -0
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +265 -0
- webscout/Provider/TTI/README.md +128 -0
- webscout/Provider/TTI/__init__.py +12 -0
- webscout/Provider/TTI/aiarta/README.md +134 -0
- webscout/Provider/TTI/aiarta/__init__.py +2 -0
- webscout/Provider/TTI/aiarta/async_aiarta.py +482 -0
- webscout/Provider/TTI/aiarta/sync_aiarta.py +440 -0
- webscout/Provider/TTI/artbit/README.md +100 -0
- webscout/Provider/TTI/artbit/__init__.py +22 -0
- webscout/Provider/TTI/artbit/async_artbit.py +155 -0
- webscout/Provider/TTI/artbit/sync_artbit.py +148 -0
- webscout/Provider/TTI/fastflux/README.md +129 -0
- webscout/Provider/TTI/fastflux/__init__.py +22 -0
- webscout/Provider/TTI/fastflux/async_fastflux.py +261 -0
- webscout/Provider/TTI/fastflux/sync_fastflux.py +252 -0
- webscout/Provider/TTI/huggingface/README.md +114 -0
- webscout/Provider/TTI/huggingface/__init__.py +22 -0
- webscout/Provider/TTI/huggingface/async_huggingface.py +199 -0
- webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -0
- webscout/Provider/TTI/piclumen/README.md +161 -0
- webscout/Provider/TTI/piclumen/__init__.py +23 -0
- webscout/Provider/TTI/piclumen/async_piclumen.py +268 -0
- webscout/Provider/TTI/piclumen/sync_piclumen.py +233 -0
- webscout/Provider/TTI/pixelmuse/README.md +79 -0
- webscout/Provider/TTI/pixelmuse/__init__.py +4 -0
- webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +249 -0
- webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +182 -0
- webscout/Provider/TTI/talkai/README.md +139 -0
- webscout/Provider/TTI/talkai/__init__.py +4 -0
- webscout/Provider/TTI/talkai/async_talkai.py +229 -0
- webscout/Provider/TTI/talkai/sync_talkai.py +207 -0
- webscout/Provider/TTS/README.md +192 -0
- webscout/Provider/TTS/__init__.py +9 -0
- webscout/Provider/TTS/base.py +159 -0
- webscout/Provider/TTS/deepgram.py +156 -0
- webscout/Provider/TTS/elevenlabs.py +111 -0
- webscout/Provider/TTS/gesserit.py +128 -0
- webscout/Provider/TTS/murfai.py +113 -0
- webscout/Provider/TTS/parler.py +111 -0
- webscout/Provider/TTS/speechma.py +580 -0
- webscout/Provider/TTS/sthir.py +94 -0
- webscout/Provider/TTS/streamElements.py +333 -0
- webscout/Provider/TTS/utils.py +280 -0
- webscout/Provider/TeachAnything.py +229 -0
- webscout/Provider/TextPollinationsAI.py +308 -0
- webscout/Provider/TwoAI.py +280 -0
- webscout/Provider/TypliAI.py +305 -0
- webscout/Provider/UNFINISHED/ChatHub.py +209 -0
- webscout/Provider/UNFINISHED/Youchat.py +330 -0
- webscout/Provider/UNFINISHED/liner_api_request.py +263 -0
- webscout/Provider/UNFINISHED/oivscode.py +351 -0
- webscout/Provider/UNFINISHED/test_lmarena.py +119 -0
- webscout/Provider/Venice.py +258 -0
- webscout/Provider/VercelAI.py +253 -0
- webscout/Provider/WiseCat.py +233 -0
- webscout/Provider/WrDoChat.py +370 -0
- webscout/Provider/Writecream.py +246 -0
- webscout/Provider/WritingMate.py +269 -0
- webscout/Provider/__init__.py +172 -0
- webscout/Provider/ai4chat.py +149 -0
- webscout/Provider/akashgpt.py +335 -0
- webscout/Provider/asksteve.py +220 -0
- webscout/Provider/cerebras.py +290 -0
- webscout/Provider/chatglm.py +215 -0
- webscout/Provider/cleeai.py +213 -0
- webscout/Provider/copilot.py +425 -0
- webscout/Provider/elmo.py +283 -0
- webscout/Provider/freeaichat.py +285 -0
- webscout/Provider/geminiapi.py +208 -0
- webscout/Provider/granite.py +235 -0
- webscout/Provider/hermes.py +266 -0
- webscout/Provider/julius.py +223 -0
- webscout/Provider/koala.py +170 -0
- webscout/Provider/learnfastai.py +325 -0
- webscout/Provider/llama3mitril.py +215 -0
- webscout/Provider/llmchat.py +258 -0
- webscout/Provider/llmchatco.py +306 -0
- webscout/Provider/lmarena.py +198 -0
- webscout/Provider/meta.py +801 -0
- webscout/Provider/multichat.py +364 -0
- webscout/Provider/samurai.py +223 -0
- webscout/Provider/scira_chat.py +299 -0
- webscout/Provider/scnet.py +243 -0
- webscout/Provider/searchchat.py +292 -0
- webscout/Provider/sonus.py +258 -0
- webscout/Provider/talkai.py +194 -0
- webscout/Provider/toolbaz.py +353 -0
- webscout/Provider/turboseek.py +266 -0
- webscout/Provider/typefully.py +202 -0
- webscout/Provider/typegpt.py +289 -0
- webscout/Provider/uncovr.py +368 -0
- webscout/Provider/x0gpt.py +299 -0
- webscout/Provider/yep.py +389 -0
- webscout/__init__.py +4 -2
- webscout/cli.py +3 -28
- webscout/conversation.py +35 -35
- webscout/litagent/Readme.md +276 -0
- webscout/litagent/__init__.py +29 -0
- webscout/litagent/agent.py +455 -0
- webscout/litagent/constants.py +60 -0
- webscout/litprinter/__init__.py +59 -0
- webscout/scout/README.md +402 -0
- webscout/scout/__init__.py +8 -0
- webscout/scout/core/__init__.py +7 -0
- webscout/scout/core/crawler.py +140 -0
- webscout/scout/core/scout.py +568 -0
- webscout/scout/core/search_result.py +96 -0
- webscout/scout/core/text_analyzer.py +63 -0
- webscout/scout/core/text_utils.py +277 -0
- webscout/scout/core/web_analyzer.py +52 -0
- webscout/scout/element.py +460 -0
- webscout/scout/parsers/__init__.py +69 -0
- webscout/scout/parsers/html5lib_parser.py +172 -0
- webscout/scout/parsers/html_parser.py +236 -0
- webscout/scout/parsers/lxml_parser.py +178 -0
- webscout/scout/utils.py +37 -0
- webscout/swiftcli/Readme.md +323 -0
- webscout/swiftcli/__init__.py +95 -0
- webscout/swiftcli/core/__init__.py +7 -0
- webscout/swiftcli/core/cli.py +297 -0
- webscout/swiftcli/core/context.py +104 -0
- webscout/swiftcli/core/group.py +241 -0
- webscout/swiftcli/decorators/__init__.py +28 -0
- webscout/swiftcli/decorators/command.py +221 -0
- webscout/swiftcli/decorators/options.py +220 -0
- webscout/swiftcli/decorators/output.py +252 -0
- webscout/swiftcli/exceptions.py +21 -0
- webscout/swiftcli/plugins/__init__.py +9 -0
- webscout/swiftcli/plugins/base.py +135 -0
- webscout/swiftcli/plugins/manager.py +262 -0
- webscout/swiftcli/utils/__init__.py +59 -0
- webscout/swiftcli/utils/formatting.py +252 -0
- webscout/swiftcli/utils/parsing.py +267 -0
- webscout/version.py +1 -1
- webscout/webscout_search.py +2 -182
- webscout/webscout_search_async.py +1 -179
- webscout/zeroart/README.md +89 -0
- webscout/zeroart/__init__.py +135 -0
- webscout/zeroart/base.py +66 -0
- webscout/zeroart/effects.py +101 -0
- webscout/zeroart/fonts.py +1239 -0
- {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/METADATA +115 -60
- webscout-8.2.8.dist-info/RECORD +334 -0
- {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/WHEEL +1 -1
- webscout-8.2.7.dist-info/RECORD +0 -26
- {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/entry_points.txt +0 -0
- {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.7.dist-info → webscout-8.2.8.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,460 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scout Element Module - Advanced HTML Element Representation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Optional, List, Dict, Union, Any
|
|
7
|
+
|
|
8
|
+
class NavigableString(str):
    """
    Text node of the document tree.

    Behaves like a regular ``str`` but additionally carries a ``parent``
    attribute, in the spirit of BeautifulSoup's NavigableString.
    """

    def __new__(cls, text: str):
        """
        Build the underlying immutable string value.

        Args:
            text (str): String content

        Returns:
            NavigableString: New instance holding ``text``
        """
        instance = str.__new__(cls, text)
        return instance

    def __init__(self, text: str):
        """
        Set up tree-related state.

        Args:
            text (str): String content (already consumed by ``__new__``)
        """
        # A freshly created string is not attached to any tag yet.
        self.parent = None

    def __repr__(self):
        """Return ``NavigableString(<repr of the plain string>)``."""
        plain_repr = str.__repr__(self)
        return f"NavigableString({plain_repr})"

    def __add__(self, other):
        """
        Concatenate with another object after converting it to ``str``.

        Args:
            other: Value to append (converted with ``str()``)

        Returns:
            str: Plain concatenated string (not a NavigableString)
        """
        return "".join((str(self), str(other)))

    def strip(self, chars=None):
        """
        Remove leading/trailing whitespace or the given characters.

        Args:
            chars (str, optional): Characters to strip

        Returns:
            NavigableString: Stripped copy (its ``parent`` is reset to None)
        """
        trimmed = str.strip(self, chars)
        return NavigableString(trimmed)
|
|
58
|
+
|
|
59
|
+
class Tag:
    """
    Represents an HTML tag with traversal, search and manipulation helpers.

    The API is modelled on BeautifulSoup's Tag class. A tag stores its name,
    an attribute dictionary, an ordered ``contents`` list (child tags and
    strings), a ``parent`` reference and an optional ``string`` shortcut.
    """

    # Splits one whitespace-free selector part into its simple components:
    # ``*``, ``tag``, ``#id``, ``.class`` and ``[attr]`` / ``[attr=value]``.
    _SELECTOR_TOKEN_RE = re.compile(r'\*|[#.]?[\w-]+|\[[^\]]*\]')

    def __init__(self, name: str, attrs: Optional[Dict[str, Any]] = None):
        """
        Initialize a Tag with name and attributes.

        Args:
            name (str): Tag name
            attrs (dict, optional): Tag attributes
        """
        self.name = name
        self.attrs = attrs or {}
        self.contents = []
        self.parent = None
        self.string = None  # For single string content

    # ------------------------------------------------------------------
    # Dunder protocol
    # ------------------------------------------------------------------

    def __str__(self):
        """Return the decoded contents of the tag (see decode_contents)."""
        return self.decode_contents()

    def __repr__(self):
        """Return a short debug representation: ``<name {attrs}>``."""
        return f"<{self.name} {self.attrs}>"

    def __call__(self, *args, **kwargs):
        """Shortcut for :meth:`find_all`, mimicking BeautifulSoup."""
        return self.find_all(*args, **kwargs)

    def __contains__(self, item):
        """
        Check if an item is in the tag's direct contents.

        Args:
            item: Item to search for

        Returns:
            bool: True if item is in contents, False otherwise
        """
        return item in self.contents

    def __getitem__(self, key):
        """
        Get an attribute value using dictionary-like access.

        Args:
            key (str): Attribute name

        Returns:
            Any: Attribute value

        Raises:
            KeyError: If the attribute does not exist
        """
        return self.attrs[key]

    def __iter__(self):
        """
        Iterate through the tag's direct contents.

        Returns:
            Iterator: Contents of the tag
        """
        return iter(self.contents)

    def __eq__(self, other):
        """
        Compare tags based on name, attributes and string form.

        Args:
            other (Tag): Tag to compare

        Returns:
            bool: True if tags are equivalent
        """
        if not isinstance(other, Tag):
            return False
        return (
            self.name == other.name and
            self.attrs == other.attrs and
            str(self) == str(other)
        )

    def __hash__(self):
        """
        Generate a hash consistent with :meth:`__eq__`.

        Attribute values are frozen first so that list-valued attributes
        (for example a list of classes) do not raise ``TypeError``.

        Returns:
            int: Hash value
        """
        frozen_attrs = frozenset(
            (key, Tag._freeze(value)) for key, value in self.attrs.items()
        )
        return hash((self.name, frozen_attrs, str(self)))

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _freeze(value):
        """Return a hashable equivalent of an attribute value."""
        if isinstance(value, (list, tuple)):
            return tuple(Tag._freeze(item) for item in value)
        if isinstance(value, (set, frozenset)):
            return frozenset(Tag._freeze(item) for item in value)
        if isinstance(value, dict):
            return frozenset((k, Tag._freeze(v)) for k, v in value.items())
        return value

    @staticmethod
    def _class_list(value) -> List[str]:
        """Normalize a ``class`` attribute (None, string or collection) to a list."""
        if value is None:
            return []
        if isinstance(value, (list, tuple, set, frozenset)):
            return [str(item) for item in value]
        return str(value).split()

    def _child_tags(self):
        """Yield the direct children that are Tag instances."""
        return (child for child in self.contents if isinstance(child, Tag))

    def _iter_descendants(self):
        """Yield every descendant Tag in depth-first, pre-order sequence."""
        # Explicit stack instead of recursion so deep trees cannot hit the
        # interpreter recursion limit.
        pending = [c for c in reversed(self.contents) if isinstance(c, Tag)]
        while pending:
            node = pending.pop()
            yield node
            pending.extend(
                c for c in reversed(node.contents) if isinstance(c, Tag)
            )

    def _index_in_parent(self) -> Optional[int]:
        """Return this tag's position in its parent's contents, by identity."""
        if self.parent is None:
            return None
        # Identity, not equality: two distinct but equal siblings (e.g. two
        # empty <br> tags) must not be confused with each other.
        for position, sibling in enumerate(self.parent.contents):
            if sibling is self:
                return position
        return None

    @staticmethod
    def _attr_matches(tag, key, expected) -> bool:
        """Check one attribute filter of find_all against ``tag``."""
        if key == 'class':
            classes = Tag._class_list(tag.get('class'))
            if isinstance(expected, str):
                return all(token in classes for token in expected.split())
            if isinstance(expected, (list, tuple, set, frozenset)):
                return all(str(cls) in classes for cls in expected)
            if isinstance(expected, re.Pattern):
                return any(expected.search(cls) for cls in classes)
            return True  # other filter types were never checked for class

        if expected is True:
            # BeautifulSoup convention: True means "attribute is present".
            return key in tag.attrs

        actual = tag.attrs.get(key)
        if isinstance(expected, re.Pattern):
            # A missing attribute must not be matched via the text 'None'.
            return actual is not None and bool(expected.search(str(actual)))
        return actual == expected

    @staticmethod
    def _matches_simple_selector(tag, part: str) -> bool:
        """Check ``tag`` against one whitespace-free (compound) selector."""
        tokens = Tag._SELECTOR_TOKEN_RE.findall(part)
        if not tokens or ''.join(tokens) != part:
            return False  # empty or unsupported syntax never matches

        for token in tokens:
            if token == '*':
                continue
            lead = token[0]
            if lead == '.':
                if token[1:] not in Tag._class_list(tag.get('class')):
                    return False
            elif lead == '#':
                if tag.get('id') != token[1:]:
                    return False
            elif lead == '[':
                attr, has_value, value = token[1:-1].partition('=')
                attr = attr.strip()
                if attr not in tag.attrs:
                    return False
                if has_value:
                    actual = tag.attrs[attr]
                    if isinstance(actual, (list, tuple)):
                        actual = ' '.join(str(item) for item in actual)
                    if actual != value.strip().strip("'\""):
                        return False
            elif tag.name.lower() != token.lower():
                return False
        return True

    # ------------------------------------------------------------------
    # Searching
    # ------------------------------------------------------------------

    def find(self, name=None, attrs=None, recursive=True, text=None, **kwargs) -> Optional['Tag']:
        """
        Find the first matching element.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str, optional): Text content to match
            **kwargs: Extra attribute filters (see :meth:`find_all`)

        Returns:
            Tag or None: First matching element
        """
        results = self.find_all(name, attrs, recursive, text, limit=1, **kwargs)
        return results[0] if results else None

    def find_all(self, name=None, attrs=None, recursive=True, text=None, limit=None, **kwargs) -> List['Tag']:
        """
        Find all matching elements.

        The tag itself is tested first, followed by its descendants
        (``recursive=True``) or only its direct children
        (``recursive=False``), in document order.

        Args:
            name (str, re.Pattern or collection of str, optional): Tag name
                filter. Strings are compared case-insensitively.
            attrs (dict or str, optional): Attributes to match. A plain
                string is treated as a class filter.
            recursive (bool, optional): Search all descendants, not only
                direct children
            text (str or re.Pattern, optional): Text content to match
                (case-insensitive substring, or regex search)
            limit (int, optional): Maximum number of results; ``None`` or 0
                means unlimited
            **kwargs: Extra attribute filters, e.g. ``id='main'`` or
                ``class_='item'`` (``class_`` is mapped to ``class``)

        Returns:
            List[Tag]: List of matching elements
        """
        # Build a private filter dict; never mutate the caller's mapping.
        if attrs is None:
            filters = {}
        elif isinstance(attrs, str):
            filters = {'class': attrs}
        else:
            filters = dict(attrs)
        for key, value in kwargs.items():
            filters['class' if key == 'class_' else key] = value

        name_set = None
        if isinstance(name, (list, tuple, set, frozenset)):
            name_set = {str(item).lower() for item in name}

        def _match(tag):
            # Tag name: case-insensitive string, regex, or collection
            if name:
                if isinstance(name, str):
                    if tag.name.lower() != name.lower():
                        return False
                elif isinstance(name, re.Pattern):
                    if not name.search(tag.name):
                        return False
                elif name_set is not None:
                    if tag.name.lower() not in name_set:
                        return False

            # Attribute filters
            for key, expected in filters.items():
                if not Tag._attr_matches(tag, key, expected):
                    return False

            # Text content
            if text:
                tag_text = tag.get_text(strip=True)
                if isinstance(text, str) and text.lower() not in tag_text.lower():
                    return False
                elif isinstance(text, re.Pattern) and not text.search(tag_text):
                    return False

            return True

        def _candidates():
            yield self
            if recursive:
                yield from self._iter_descendants()
            else:
                yield from self._child_tags()

        results = []
        for candidate in _candidates():
            if _match(candidate):
                results.append(candidate)
                # Stop the whole traversal, not just one branch of it.
                if limit and len(results) >= limit:
                    break
        return results

    def select(self, selector: str) -> List['Tag']:
        """
        Select descendant elements using a simplified CSS selector.

        Supported syntax: tag names, ``*``, ``#id``, ``.class``,
        ``[attr]``, ``[attr=value]``, compounds of those (``div.item``),
        the descendant combinator (whitespace) and the child combinator
        (``>``). Selector groups (``a, b``), pseudo-classes and attribute
        values containing whitespace are not supported and match nothing.

        Args:
            selector (str): CSS selector string

        Returns:
            List[Tag]: Matching elements in traversal order, no duplicates
        """
        parts = re.sub(r'\s*>\s*', ' > ', selector).split()

        contexts = [self]
        direct_only = False
        matched_any_part = False

        for part in parts:
            if part == '>':
                direct_only = True
                continue

            matched, seen = [], set()
            for context in contexts:
                pool = context._child_tags() if direct_only else context._iter_descendants()
                for candidate in pool:
                    if id(candidate) in seen:
                        continue
                    if Tag._matches_simple_selector(candidate, part):
                        seen.add(id(candidate))
                        matched.append(candidate)

            contexts = matched
            direct_only = False
            matched_any_part = True
            if not contexts:
                break

        # An empty selector (or one made only of '>') selects nothing.
        return contexts if matched_any_part else []

    def select_one(self, selector: str) -> Optional['Tag']:
        """
        Select the first element matching the CSS selector.

        Args:
            selector (str): CSS selector string

        Returns:
            Tag or None: First matching element
        """
        results = self.select(selector)
        return results[0] if results else None

    # ------------------------------------------------------------------
    # Text helpers
    # ------------------------------------------------------------------

    def get_text(self, separator=' ', strip=False, types=None) -> str:
        """
        Extract text from the tag and its descendants.

        Args:
            separator (str, optional): String placed between text pieces
            strip (bool, optional): Strip surrounding whitespace
            types (collection, optional): If given, only direct contents
                whose exact type is in this collection are considered

        Returns:
            str: Extracted text
        """
        pieces = []
        for content in self.contents:
            if types is not None and type(content) not in types:
                continue
            if isinstance(content, NavigableString):
                pieces.append(str(content))
            elif isinstance(content, Tag):
                pieces.append(content.get_text(separator, strip))

        text = separator.join(pieces)
        text = re.sub(r'\n\n+', '\n', text)  # Collapse runs of newlines
        return text.strip() if strip else text

    def find_text(self, pattern: Union[str, re.Pattern], **kwargs) -> Optional[str]:
        """
        Find the first text matching a pattern.

        Args:
            pattern (str or re.Pattern): Pattern to match
            **kwargs: Additional arguments for get_text()

        Returns:
            str or None: The pattern itself (str) or the regex match text;
            None when nothing matches or the pattern type is unsupported
        """
        text = self.get_text(**kwargs)

        if isinstance(pattern, str):
            return pattern if pattern in text else None
        if isinstance(pattern, re.Pattern):
            match = pattern.search(text)
            return match.group(0) if match else None
        return None

    def replace_text(self, old: Union[str, re.Pattern], new: str, **kwargs) -> str:
        """
        Return the tag's text with matches of a pattern replaced.

        The tree itself is not modified.

        Args:
            old (str or re.Pattern): Pattern to replace
            new (str): Replacement text
            **kwargs: Additional arguments for get_text()

        Returns:
            str: Modified text (None if ``old`` has an unsupported type)
        """
        text = self.get_text(**kwargs)

        if isinstance(old, str):
            return text.replace(old, new)
        if isinstance(old, re.Pattern):
            return old.sub(new, text)
        return None

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get an attribute value.

        Args:
            key (str): Attribute name
            default (Any, optional): Default value if attribute not found

        Returns:
            Any: Attribute value or default
        """
        return self.attrs.get(key, default)

    # ------------------------------------------------------------------
    # Tree manipulation
    # ------------------------------------------------------------------

    def decompose(self) -> None:
        """Remove the tag (and therefore its contents) from its parent."""
        position = self._index_in_parent()
        if position is not None:
            del self.parent.contents[position]
        # Detached tags must not keep pointing at their former parent.
        self.parent = None

    def extract(self) -> 'Tag':
        """
        Remove the tag from the document and return it.

        Returns:
            Tag: Extracted tag
        """
        self.decompose()
        return self

    def clear(self) -> None:
        """Remove all contents of the tag."""
        self.contents.clear()

    def replace_with(self, new_tag: 'Tag') -> None:
        """
        Replace this tag with another tag in its parent's contents.

        Does nothing when the tag has no parent.

        Args:
            new_tag (Tag): Tag to replace the current tag
        """
        position = self._index_in_parent()
        if position is None or new_tag is self:
            return
        parent = self.parent
        parent.contents[position] = new_tag
        new_tag.parent = parent
        self.parent = None

    # ------------------------------------------------------------------
    # Output
    # ------------------------------------------------------------------

    def decode_contents(self, eventual_encoding='utf-8') -> str:
        """
        Decode the contents of the tag to a string.

        Each content item is converted with ``str()``; for child tags that
        is again their decoded contents, so no tag markup is emitted.

        Args:
            eventual_encoding (str, optional): Accepted for API
                compatibility; currently unused

        Returns:
            str: Decoded contents
        """
        return ''.join(str(content) for content in self.contents)

    def prettify(self, formatter='minimal') -> str:
        """
        Return an indented, one-node-per-line representation of the tag.

        Args:
            formatter (str, optional): Accepted for API compatibility;
                currently unused

        Returns:
            str: Prettified tag representation
        """
        def _attr_value(value):
            # Multi-valued attributes (e.g. class lists) are space separated.
            if isinstance(value, (list, tuple)):
                return ' '.join(str(item) for item in value)
            return value

        def _render(tag, indent=0):
            pad = ' ' * indent
            attr_text = ''.join(
                f' {key}="{_attr_value(value)}"' for key, value in tag.attrs.items()
            )
            chunks = [f'{pad}<{tag.name}{attr_text}>\n']
            for content in tag.contents:
                if isinstance(content, Tag):
                    chunks.append(_render(content, indent + 2))
                else:
                    chunks.append(' ' * (indent + 2) + str(content) + '\n')
            chunks.append(f'{pad}</{tag.name}>\n')
            return ''.join(chunks)

        return _render(self)
|
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scout Parsers - Unified Parsing Interfaces
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from typing import Dict, Type, Any
|
|
6
|
+
|
|
7
|
+
from .html_parser import HTMLParser
|
|
8
|
+
from .lxml_parser import LXMLParser
|
|
9
|
+
from .html5lib_parser import HTML5Parser
|
|
10
|
+
|
|
11
|
+
class ParserRegistry:
    """
    Central lookup table of the HTML parsers available to Scout.

    Parsers are stored by name in a class-level dictionary, so registrations
    are shared by every user of the registry.
    """

    # Built-in parsers, keyed by the name callers pass to get_parser().
    _PARSERS: Dict[str, Type[Any]] = {
        'html.parser': HTMLParser,
        'lxml': LXMLParser,
        'html5lib': HTML5Parser
    }

    @classmethod
    def get_parser(cls, parser_name: str = 'html.parser') -> Any:
        """
        Instantiate the parser registered under ``parser_name``.

        Args:
            parser_name (str): Name of the parser to retrieve

        Returns:
            A new instance of the registered parser class (constructed
            without arguments)

        Raises:
            ValueError: If no parser is registered under that name
        """
        if parser_name in cls._PARSERS:
            parser_class = cls._PARSERS[parser_name]
            return parser_class()

        available = list(cls._PARSERS.keys())
        raise ValueError(f"Parser '{parser_name}' not found. Available parsers: {available}")

    @classmethod
    def register_parser(cls, name: str, parser_class: Type[Any]):
        """
        Add a parser class to the registry, replacing any existing entry.

        Args:
            name (str): Name of the parser
            parser_class (Type): Parser class to register
        """
        cls._PARSERS[name] = parser_class

    @classmethod
    def list_parsers(cls) -> Dict[str, Type[Any]]:
        """
        Return a snapshot of the registered parsers.

        Returns:
            Dict mapping parser names to parser classes (a shallow copy, so
            modifying it does not affect the registry)
        """
        return dict(cls._PARSERS)
|
|
62
|
+
|
|
63
|
+
# Public API of the parsers package: the three concrete parsers and the registry.
__all__ = [
    "HTMLParser",
    "LXMLParser",
    "HTML5Parser",
    "ParserRegistry",
]
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Scout HTML5 Parser - Advanced HTML5 Parsing with html5lib
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import List, Optional, Dict, Any, Union
|
|
7
|
+
|
|
8
|
+
import html5lib
|
|
9
|
+
from ..element import Tag, NavigableString
|
|
10
|
+
|
|
11
|
+
class HTML5Parser:
    """
    Advanced HTML5 parser using html5lib library.
    Provides robust parsing with enhanced error handling and flexibility.

    Parsing is best-effort: any exception raised while parsing is recorded
    (see get_parsing_errors) and an empty ``Tag('root')`` is returned instead.
    """

    # Compiled once; both patterns are applied on every parse() call.
    _COMMENT_RE = re.compile(r'<!--.*?-->', flags=re.DOTALL)
    # Void elements written without a trailing slash. The lookahead stops
    # longer names such as <brand> from matching, and quoted attribute values
    # are consumed as a unit so a '>' inside them does not end the tag early.
    _VOID_TAG_RE = re.compile(
        r'<(br|img|input|hr|meta)(?=[\s/>])'
        r'''((?:"[^"]*"|'[^']*'|[^>"'])*?)(?<!/)>''',
        flags=re.IGNORECASE,
    )

    def __init__(self, namespaces: bool = False, debug: bool = False):
        """
        Initialize the HTML5 parser with advanced parsing capabilities.

        Args:
            namespaces (bool): Whether to preserve namespace information
            debug (bool): Enable debug mode for parsing
        """
        self._namespaces = namespaces
        self._debug = debug
        self._parsing_errors = []

    def parse(self, markup: str) -> "Tag":
        """
        Parse HTML5 markup and return the root tag.

        Args:
            markup (str): HTML5 content to parse. Bytes are also accepted and
                decoded as UTF-8 (undecodable bytes are replaced).

        Returns:
            Tag: Parsed document root, or an empty ``Tag('root')`` if parsing
            failed (the error message is appended to the parsing-error list)
        """
        try:
            # The preprocessing regexes need text, and html5lib refuses an
            # encoding argument for text input, so decode bytes up front
            # instead of passing transport_encoding.
            if isinstance(markup, (bytes, bytearray)):
                markup = bytes(markup).decode('utf-8', errors='replace')

            # Preprocess markup to handle common issues
            markup = self._preprocess_markup(markup)

            # Parse the markup
            tree = html5lib.parse(
                markup,
                namespaceHTMLElements=self._namespaces
            )

            # The default "etree" treebuilder hands back the <html> element
            # itself; only ElementTree-like results expose getroot().
            root = tree.getroot() if hasattr(tree, 'getroot') else tree

            # Convert parsed tree to Scout Tag
            return self._convert_element(root)

        except Exception as e:
            # Deliberate best-effort: record the failure, return an empty root.
            self._parsing_errors.append(str(e))
            return Tag('root')

    def _preprocess_markup(self, markup: str) -> str:
        """
        Preprocess HTML markup to handle common parsing issues.

        HTML comments are removed, and ``br``/``img``/``input``/``hr``/``meta``
        tags that lack a trailing slash are rewritten to self-closing form.

        Args:
            markup (str): Raw HTML markup

        Returns:
            str: Preprocessed HTML markup
        """
        # Remove HTML comments
        markup = self._COMMENT_RE.sub('', markup)

        # Handle unclosed tags
        markup = self._VOID_TAG_RE.sub(r'<\1\2 />', markup)

        return markup

    def _convert_element(self, element) -> "Tag":
        """
        Convert html5lib element to Scout Tag.

        Args:
            element: html5lib parsed element (ElementTree-style: ``tag``,
                ``attrib``, ``text``, ``tail`` and iterable children)

        Returns:
            Tag: Converted Scout Tag with its text and child tags in
            document order
        """
        # Create Tag with name and attributes
        tag = Tag(element.tag, dict(element.attrib))

        # Add text content (parent is set the same way as for tail text below)
        if element.text:
            leading_text = NavigableString(element.text)
            leading_text.parent = tag
            tag.contents.append(leading_text)

        # Recursively add child elements
        for child in element:
            # Non-element nodes (e.g. comments) do not carry a string tag name;
            # they are not converted, but their tail text is still kept.
            if isinstance(child.tag, str):
                child_tag = self._convert_element(child)
                child_tag.parent = tag
                tag.contents.append(child_tag)

            # Add tail text
            if child.tail:
                tail_text = NavigableString(child.tail)
                tail_text.parent = tag
                tag.contents.append(tail_text)

        return tag

    def get_parsing_errors(self) -> List[str]:
        """
        Retrieve parsing errors encountered during processing.

        Errors accumulate across parse() calls on the same instance.

        Returns:
            List[str]: List of parsing error messages
        """
        return self._parsing_errors

    def find_all(self, markup: str, tag: Optional[Union[str, List[str]]] = None,
                 attrs: Optional[Dict[str, Any]] = None,
                 recursive: bool = True,
                 text: Optional[str] = None,
                 limit: Optional[int] = None) -> List["Tag"]:
        """
        Find all matching elements in the parsed document.

        Args:
            markup (str): HTML content to parse
            tag (str or List[str], optional): Tag name(s) to search for
            attrs (dict, optional): Attribute filters (exact value match)
            recursive (bool): Whether to search recursively; when False only
                the document root itself is tested
            text (str, optional): Substring that must occur in the element's
                own direct text
            limit (int, optional): Maximum number of results; a falsy value
                means no limit

        Returns:
            List[Tag]: List of matching tags in document order
        """
        root = self.parse(markup)

        def matches(element: "Tag") -> bool:
            """Check if an element matches search criteria."""
            # Tag filter
            if tag:
                if isinstance(tag, str) and element.name != tag:
                    return False
                if isinstance(tag, list) and element.name not in tag:
                    return False

            # Attribute filter
            if attrs:
                for key, value in attrs.items():
                    if key not in element.attrs or element.attrs[key] != value:
                        return False

            # Text filter
            if text:
                element_text = ' '.join(
                    str(c) for c in element.contents if isinstance(c, NavigableString)
                )
                if text not in element_text:
                    return False

            return True

        def walk(element: "Tag"):
            """Yield the element, then (if recursive) its descendants, pre-order."""
            yield element
            if recursive:
                for child in element.contents:
                    if isinstance(child, Tag):
                        yield from walk(child)

        results = []
        for element in walk(root):
            if matches(element):
                results.append(element)
                # Stop the whole traversal, not just the current subtree.
                if limit and len(results) >= limit:
                    break
        return results
|