webscout 8.2.9__py3-none-any.whl → 2026.1.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- webscout/AIauto.py +524 -251
- webscout/AIbase.py +247 -319
- webscout/AIutel.py +68 -703
- webscout/Bard.py +1072 -1026
- webscout/Extra/GitToolkit/__init__.py +10 -10
- webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
- webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
- webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
- webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
- webscout/Extra/GitToolkit/gitapi/search.py +162 -0
- webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
- webscout/Extra/GitToolkit/gitapi/user.py +128 -96
- webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
- webscout/Extra/YTToolkit/README.md +443 -375
- webscout/Extra/YTToolkit/YTdownloader.py +953 -957
- webscout/Extra/YTToolkit/__init__.py +3 -3
- webscout/Extra/YTToolkit/transcriber.py +595 -476
- webscout/Extra/YTToolkit/ytapi/README.md +230 -44
- webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
- webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
- webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
- webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
- webscout/Extra/YTToolkit/ytapi/extras.py +178 -118
- webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
- webscout/Extra/YTToolkit/ytapi/https.py +89 -88
- webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
- webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
- webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
- webscout/Extra/YTToolkit/ytapi/query.py +143 -40
- webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
- webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
- webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
- webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
- webscout/Extra/YTToolkit/ytapi/video.py +403 -232
- webscout/Extra/__init__.py +2 -3
- webscout/Extra/gguf.py +1298 -684
- webscout/Extra/tempmail/README.md +487 -487
- webscout/Extra/tempmail/__init__.py +28 -28
- webscout/Extra/tempmail/async_utils.py +143 -141
- webscout/Extra/tempmail/base.py +172 -161
- webscout/Extra/tempmail/cli.py +191 -187
- webscout/Extra/tempmail/emailnator.py +88 -84
- webscout/Extra/tempmail/mail_tm.py +378 -361
- webscout/Extra/tempmail/temp_mail_io.py +304 -292
- webscout/Extra/weather.py +196 -194
- webscout/Extra/weather_ascii.py +17 -15
- webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
- webscout/Provider/AISEARCH/Perplexity.py +292 -333
- webscout/Provider/AISEARCH/README.md +106 -279
- webscout/Provider/AISEARCH/__init__.py +16 -9
- webscout/Provider/AISEARCH/brave_search.py +298 -0
- webscout/Provider/AISEARCH/iask_search.py +357 -410
- webscout/Provider/AISEARCH/monica_search.py +200 -220
- webscout/Provider/AISEARCH/webpilotai_search.py +242 -255
- webscout/Provider/Algion.py +413 -0
- webscout/Provider/Andi.py +74 -69
- webscout/Provider/Apriel.py +313 -0
- webscout/Provider/Ayle.py +323 -0
- webscout/Provider/ChatSandbox.py +329 -342
- webscout/Provider/ClaudeOnline.py +365 -0
- webscout/Provider/Cohere.py +232 -208
- webscout/Provider/DeepAI.py +367 -0
- webscout/Provider/Deepinfra.py +467 -340
- webscout/Provider/EssentialAI.py +217 -0
- webscout/Provider/ExaAI.py +274 -261
- webscout/Provider/Gemini.py +175 -169
- webscout/Provider/GithubChat.py +385 -369
- webscout/Provider/Gradient.py +286 -0
- webscout/Provider/Groq.py +556 -801
- webscout/Provider/HadadXYZ.py +323 -0
- webscout/Provider/HeckAI.py +392 -375
- webscout/Provider/HuggingFace.py +387 -0
- webscout/Provider/IBM.py +340 -0
- webscout/Provider/Jadve.py +317 -291
- webscout/Provider/K2Think.py +306 -0
- webscout/Provider/Koboldai.py +221 -384
- webscout/Provider/Netwrck.py +273 -270
- webscout/Provider/Nvidia.py +310 -0
- webscout/Provider/OPENAI/DeepAI.py +489 -0
- webscout/Provider/OPENAI/K2Think.py +423 -0
- webscout/Provider/OPENAI/PI.py +463 -0
- webscout/Provider/OPENAI/README.md +890 -952
- webscout/Provider/OPENAI/TogetherAI.py +405 -0
- webscout/Provider/OPENAI/TwoAI.py +255 -357
- webscout/Provider/OPENAI/__init__.py +148 -40
- webscout/Provider/OPENAI/ai4chat.py +348 -293
- webscout/Provider/OPENAI/akashgpt.py +436 -0
- webscout/Provider/OPENAI/algion.py +303 -0
- webscout/Provider/OPENAI/{exachat.py → ayle.py} +365 -444
- webscout/Provider/OPENAI/base.py +253 -249
- webscout/Provider/OPENAI/cerebras.py +296 -0
- webscout/Provider/OPENAI/chatgpt.py +870 -556
- webscout/Provider/OPENAI/chatsandbox.py +233 -173
- webscout/Provider/OPENAI/deepinfra.py +403 -322
- webscout/Provider/OPENAI/e2b.py +2370 -1414
- webscout/Provider/OPENAI/elmo.py +278 -0
- webscout/Provider/OPENAI/exaai.py +452 -417
- webscout/Provider/OPENAI/freeassist.py +446 -0
- webscout/Provider/OPENAI/gradient.py +448 -0
- webscout/Provider/OPENAI/groq.py +380 -364
- webscout/Provider/OPENAI/hadadxyz.py +292 -0
- webscout/Provider/OPENAI/heckai.py +333 -308
- webscout/Provider/OPENAI/huggingface.py +321 -0
- webscout/Provider/OPENAI/ibm.py +425 -0
- webscout/Provider/OPENAI/llmchat.py +253 -0
- webscout/Provider/OPENAI/llmchatco.py +378 -335
- webscout/Provider/OPENAI/meta.py +541 -0
- webscout/Provider/OPENAI/netwrck.py +374 -357
- webscout/Provider/OPENAI/nvidia.py +317 -0
- webscout/Provider/OPENAI/oivscode.py +348 -287
- webscout/Provider/OPENAI/openrouter.py +328 -0
- webscout/Provider/OPENAI/pydantic_imports.py +1 -172
- webscout/Provider/OPENAI/sambanova.py +397 -0
- webscout/Provider/OPENAI/sonus.py +305 -304
- webscout/Provider/OPENAI/textpollinations.py +370 -339
- webscout/Provider/OPENAI/toolbaz.py +375 -413
- webscout/Provider/OPENAI/typefully.py +419 -355
- webscout/Provider/OPENAI/typliai.py +279 -0
- webscout/Provider/OPENAI/utils.py +314 -318
- webscout/Provider/OPENAI/wisecat.py +359 -387
- webscout/Provider/OPENAI/writecream.py +185 -163
- webscout/Provider/OPENAI/x0gpt.py +462 -365
- webscout/Provider/OPENAI/zenmux.py +380 -0
- webscout/Provider/OpenRouter.py +386 -0
- webscout/Provider/Openai.py +337 -496
- webscout/Provider/PI.py +443 -429
- webscout/Provider/QwenLM.py +346 -254
- webscout/Provider/STT/__init__.py +28 -0
- webscout/Provider/STT/base.py +303 -0
- webscout/Provider/STT/elevenlabs.py +264 -0
- webscout/Provider/Sambanova.py +317 -0
- webscout/Provider/TTI/README.md +69 -82
- webscout/Provider/TTI/__init__.py +37 -7
- webscout/Provider/TTI/base.py +147 -64
- webscout/Provider/TTI/claudeonline.py +393 -0
- webscout/Provider/TTI/magicstudio.py +292 -201
- webscout/Provider/TTI/miragic.py +180 -0
- webscout/Provider/TTI/pollinations.py +331 -221
- webscout/Provider/TTI/together.py +334 -0
- webscout/Provider/TTI/utils.py +14 -11
- webscout/Provider/TTS/README.md +186 -192
- webscout/Provider/TTS/__init__.py +43 -10
- webscout/Provider/TTS/base.py +523 -159
- webscout/Provider/TTS/deepgram.py +286 -156
- webscout/Provider/TTS/elevenlabs.py +189 -111
- webscout/Provider/TTS/freetts.py +218 -0
- webscout/Provider/TTS/murfai.py +288 -113
- webscout/Provider/TTS/openai_fm.py +364 -129
- webscout/Provider/TTS/parler.py +203 -111
- webscout/Provider/TTS/qwen.py +334 -0
- webscout/Provider/TTS/sherpa.py +286 -0
- webscout/Provider/TTS/speechma.py +693 -580
- webscout/Provider/TTS/streamElements.py +275 -333
- webscout/Provider/TTS/utils.py +280 -280
- webscout/Provider/TextPollinationsAI.py +331 -308
- webscout/Provider/TogetherAI.py +450 -0
- webscout/Provider/TwoAI.py +309 -475
- webscout/Provider/TypliAI.py +311 -305
- webscout/Provider/UNFINISHED/ChatHub.py +219 -209
- webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +331 -326
- webscout/Provider/{GizAI.py → UNFINISHED/GizAI.py} +300 -295
- webscout/Provider/{Marcus.py → UNFINISHED/Marcus.py} +218 -198
- webscout/Provider/UNFINISHED/Qodo.py +481 -0
- webscout/Provider/{MCPCore.py → UNFINISHED/XenAI.py} +330 -315
- webscout/Provider/UNFINISHED/Youchat.py +347 -330
- webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
- webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
- webscout/Provider/UNFINISHED/liner.py +342 -0
- webscout/Provider/UNFINISHED/liner_api_request.py +246 -263
- webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +231 -224
- webscout/Provider/WiseCat.py +256 -233
- webscout/Provider/WrDoChat.py +390 -370
- webscout/Provider/__init__.py +115 -174
- webscout/Provider/ai4chat.py +181 -174
- webscout/Provider/akashgpt.py +330 -335
- webscout/Provider/cerebras.py +397 -290
- webscout/Provider/cleeai.py +236 -213
- webscout/Provider/elmo.py +291 -283
- webscout/Provider/geminiapi.py +343 -208
- webscout/Provider/julius.py +245 -223
- webscout/Provider/learnfastai.py +333 -325
- webscout/Provider/llama3mitril.py +230 -215
- webscout/Provider/llmchat.py +308 -258
- webscout/Provider/llmchatco.py +321 -306
- webscout/Provider/meta.py +996 -801
- webscout/Provider/oivscode.py +332 -309
- webscout/Provider/searchchat.py +316 -292
- webscout/Provider/sonus.py +264 -258
- webscout/Provider/toolbaz.py +359 -353
- webscout/Provider/turboseek.py +332 -266
- webscout/Provider/typefully.py +262 -202
- webscout/Provider/x0gpt.py +332 -299
- webscout/__init__.py +31 -39
- webscout/__main__.py +5 -5
- webscout/cli.py +585 -524
- webscout/client.py +1497 -70
- webscout/conversation.py +140 -436
- webscout/exceptions.py +383 -362
- webscout/litagent/__init__.py +29 -29
- webscout/litagent/agent.py +492 -455
- webscout/litagent/constants.py +60 -60
- webscout/models.py +505 -181
- webscout/optimizers.py +74 -420
- webscout/prompt_manager.py +376 -288
- webscout/sanitize.py +1514 -0
- webscout/scout/README.md +452 -404
- webscout/scout/__init__.py +8 -8
- webscout/scout/core/__init__.py +7 -7
- webscout/scout/core/crawler.py +330 -210
- webscout/scout/core/scout.py +800 -607
- webscout/scout/core/search_result.py +51 -96
- webscout/scout/core/text_analyzer.py +64 -63
- webscout/scout/core/text_utils.py +412 -277
- webscout/scout/core/web_analyzer.py +54 -52
- webscout/scout/element.py +872 -478
- webscout/scout/parsers/__init__.py +70 -69
- webscout/scout/parsers/html5lib_parser.py +182 -172
- webscout/scout/parsers/html_parser.py +238 -236
- webscout/scout/parsers/lxml_parser.py +203 -178
- webscout/scout/utils.py +38 -37
- webscout/search/__init__.py +47 -0
- webscout/search/base.py +201 -0
- webscout/search/bing_main.py +45 -0
- webscout/search/brave_main.py +92 -0
- webscout/search/duckduckgo_main.py +57 -0
- webscout/search/engines/__init__.py +127 -0
- webscout/search/engines/bing/__init__.py +15 -0
- webscout/search/engines/bing/base.py +35 -0
- webscout/search/engines/bing/images.py +114 -0
- webscout/search/engines/bing/news.py +96 -0
- webscout/search/engines/bing/suggestions.py +36 -0
- webscout/search/engines/bing/text.py +109 -0
- webscout/search/engines/brave/__init__.py +19 -0
- webscout/search/engines/brave/base.py +47 -0
- webscout/search/engines/brave/images.py +213 -0
- webscout/search/engines/brave/news.py +353 -0
- webscout/search/engines/brave/suggestions.py +318 -0
- webscout/search/engines/brave/text.py +167 -0
- webscout/search/engines/brave/videos.py +364 -0
- webscout/search/engines/duckduckgo/__init__.py +25 -0
- webscout/search/engines/duckduckgo/answers.py +80 -0
- webscout/search/engines/duckduckgo/base.py +189 -0
- webscout/search/engines/duckduckgo/images.py +100 -0
- webscout/search/engines/duckduckgo/maps.py +183 -0
- webscout/search/engines/duckduckgo/news.py +70 -0
- webscout/search/engines/duckduckgo/suggestions.py +22 -0
- webscout/search/engines/duckduckgo/text.py +221 -0
- webscout/search/engines/duckduckgo/translate.py +48 -0
- webscout/search/engines/duckduckgo/videos.py +80 -0
- webscout/search/engines/duckduckgo/weather.py +84 -0
- webscout/search/engines/mojeek.py +61 -0
- webscout/search/engines/wikipedia.py +77 -0
- webscout/search/engines/yahoo/__init__.py +41 -0
- webscout/search/engines/yahoo/answers.py +19 -0
- webscout/search/engines/yahoo/base.py +34 -0
- webscout/search/engines/yahoo/images.py +323 -0
- webscout/search/engines/yahoo/maps.py +19 -0
- webscout/search/engines/yahoo/news.py +258 -0
- webscout/search/engines/yahoo/suggestions.py +140 -0
- webscout/search/engines/yahoo/text.py +273 -0
- webscout/search/engines/yahoo/translate.py +19 -0
- webscout/search/engines/yahoo/videos.py +302 -0
- webscout/search/engines/yahoo/weather.py +220 -0
- webscout/search/engines/yandex.py +67 -0
- webscout/search/engines/yep/__init__.py +13 -0
- webscout/search/engines/yep/base.py +34 -0
- webscout/search/engines/yep/images.py +101 -0
- webscout/search/engines/yep/suggestions.py +38 -0
- webscout/search/engines/yep/text.py +99 -0
- webscout/search/http_client.py +172 -0
- webscout/search/results.py +141 -0
- webscout/search/yahoo_main.py +57 -0
- webscout/search/yep_main.py +48 -0
- webscout/server/__init__.py +48 -0
- webscout/server/config.py +78 -0
- webscout/server/exceptions.py +69 -0
- webscout/server/providers.py +286 -0
- webscout/server/request_models.py +131 -0
- webscout/server/request_processing.py +404 -0
- webscout/server/routes.py +642 -0
- webscout/server/server.py +351 -0
- webscout/server/ui_templates.py +1171 -0
- webscout/swiftcli/__init__.py +79 -95
- webscout/swiftcli/core/__init__.py +7 -7
- webscout/swiftcli/core/cli.py +574 -297
- webscout/swiftcli/core/context.py +98 -104
- webscout/swiftcli/core/group.py +268 -241
- webscout/swiftcli/decorators/__init__.py +28 -28
- webscout/swiftcli/decorators/command.py +243 -221
- webscout/swiftcli/decorators/options.py +247 -220
- webscout/swiftcli/decorators/output.py +392 -252
- webscout/swiftcli/exceptions.py +21 -21
- webscout/swiftcli/plugins/__init__.py +9 -9
- webscout/swiftcli/plugins/base.py +134 -135
- webscout/swiftcli/plugins/manager.py +269 -269
- webscout/swiftcli/utils/__init__.py +58 -59
- webscout/swiftcli/utils/formatting.py +251 -252
- webscout/swiftcli/utils/parsing.py +368 -267
- webscout/update_checker.py +280 -136
- webscout/utils.py +28 -14
- webscout/version.py +2 -1
- webscout/version.py.bak +3 -0
- webscout/zeroart/__init__.py +218 -135
- webscout/zeroart/base.py +70 -66
- webscout/zeroart/effects.py +155 -101
- webscout/zeroart/fonts.py +1799 -1239
- webscout-2026.1.19.dist-info/METADATA +638 -0
- webscout-2026.1.19.dist-info/RECORD +312 -0
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/entry_points.txt +1 -1
- webscout/DWEBS.py +0 -520
- webscout/Extra/Act.md +0 -309
- webscout/Extra/GitToolkit/gitapi/README.md +0 -110
- webscout/Extra/autocoder/__init__.py +0 -9
- webscout/Extra/autocoder/autocoder.py +0 -1105
- webscout/Extra/autocoder/autocoder_utiles.py +0 -332
- webscout/Extra/gguf.md +0 -430
- webscout/Extra/weather.md +0 -281
- webscout/Litlogger/README.md +0 -10
- webscout/Litlogger/__init__.py +0 -15
- webscout/Litlogger/formats.py +0 -4
- webscout/Litlogger/handlers.py +0 -103
- webscout/Litlogger/levels.py +0 -13
- webscout/Litlogger/logger.py +0 -92
- webscout/Provider/AI21.py +0 -177
- webscout/Provider/AISEARCH/DeepFind.py +0 -254
- webscout/Provider/AISEARCH/felo_search.py +0 -202
- webscout/Provider/AISEARCH/genspark_search.py +0 -324
- webscout/Provider/AISEARCH/hika_search.py +0 -186
- webscout/Provider/AISEARCH/scira_search.py +0 -298
- webscout/Provider/Aitopia.py +0 -316
- webscout/Provider/AllenAI.py +0 -440
- webscout/Provider/Blackboxai.py +0 -791
- webscout/Provider/ChatGPTClone.py +0 -237
- webscout/Provider/ChatGPTGratis.py +0 -194
- webscout/Provider/Cloudflare.py +0 -324
- webscout/Provider/ExaChat.py +0 -358
- webscout/Provider/Flowith.py +0 -217
- webscout/Provider/FreeGemini.py +0 -250
- webscout/Provider/Glider.py +0 -225
- webscout/Provider/HF_space/__init__.py +0 -0
- webscout/Provider/HF_space/qwen_qwen2.py +0 -206
- webscout/Provider/HuggingFaceChat.py +0 -469
- webscout/Provider/Hunyuan.py +0 -283
- webscout/Provider/LambdaChat.py +0 -411
- webscout/Provider/Llama3.py +0 -259
- webscout/Provider/Nemotron.py +0 -218
- webscout/Provider/OLLAMA.py +0 -396
- webscout/Provider/OPENAI/BLACKBOXAI.py +0 -766
- webscout/Provider/OPENAI/Cloudflare.py +0 -378
- webscout/Provider/OPENAI/FreeGemini.py +0 -283
- webscout/Provider/OPENAI/NEMOTRON.py +0 -232
- webscout/Provider/OPENAI/Qwen3.py +0 -283
- webscout/Provider/OPENAI/api.py +0 -969
- webscout/Provider/OPENAI/c4ai.py +0 -373
- webscout/Provider/OPENAI/chatgptclone.py +0 -494
- webscout/Provider/OPENAI/copilot.py +0 -242
- webscout/Provider/OPENAI/flowith.py +0 -162
- webscout/Provider/OPENAI/freeaichat.py +0 -359
- webscout/Provider/OPENAI/mcpcore.py +0 -389
- webscout/Provider/OPENAI/multichat.py +0 -376
- webscout/Provider/OPENAI/opkfc.py +0 -496
- webscout/Provider/OPENAI/scirachat.py +0 -477
- webscout/Provider/OPENAI/standardinput.py +0 -433
- webscout/Provider/OPENAI/typegpt.py +0 -364
- webscout/Provider/OPENAI/uncovrAI.py +0 -463
- webscout/Provider/OPENAI/venice.py +0 -431
- webscout/Provider/OPENAI/yep.py +0 -382
- webscout/Provider/OpenGPT.py +0 -209
- webscout/Provider/Perplexitylabs.py +0 -415
- webscout/Provider/Reka.py +0 -214
- webscout/Provider/StandardInput.py +0 -290
- webscout/Provider/TTI/aiarta.py +0 -365
- webscout/Provider/TTI/artbit.py +0 -0
- webscout/Provider/TTI/fastflux.py +0 -200
- webscout/Provider/TTI/piclumen.py +0 -203
- webscout/Provider/TTI/pixelmuse.py +0 -225
- webscout/Provider/TTS/gesserit.py +0 -128
- webscout/Provider/TTS/sthir.py +0 -94
- webscout/Provider/TeachAnything.py +0 -229
- webscout/Provider/UNFINISHED/puterjs.py +0 -635
- webscout/Provider/UNFINISHED/test_lmarena.py +0 -119
- webscout/Provider/Venice.py +0 -258
- webscout/Provider/VercelAI.py +0 -253
- webscout/Provider/Writecream.py +0 -246
- webscout/Provider/WritingMate.py +0 -269
- webscout/Provider/asksteve.py +0 -220
- webscout/Provider/chatglm.py +0 -215
- webscout/Provider/copilot.py +0 -425
- webscout/Provider/freeaichat.py +0 -285
- webscout/Provider/granite.py +0 -235
- webscout/Provider/hermes.py +0 -266
- webscout/Provider/koala.py +0 -170
- webscout/Provider/lmarena.py +0 -198
- webscout/Provider/multichat.py +0 -364
- webscout/Provider/scira_chat.py +0 -299
- webscout/Provider/scnet.py +0 -243
- webscout/Provider/talkai.py +0 -194
- webscout/Provider/typegpt.py +0 -289
- webscout/Provider/uncovr.py +0 -368
- webscout/Provider/yep.py +0 -389
- webscout/litagent/Readme.md +0 -276
- webscout/litprinter/__init__.py +0 -59
- webscout/swiftcli/Readme.md +0 -323
- webscout/tempid.py +0 -128
- webscout/webscout_search.py +0 -1184
- webscout/webscout_search_async.py +0 -654
- webscout/yep_search.py +0 -347
- webscout/zeroart/README.md +0 -89
- webscout-8.2.9.dist-info/METADATA +0 -1033
- webscout-8.2.9.dist-info/RECORD +0 -289
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/licenses/LICENSE.md +0 -0
- {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/top_level.txt +0 -0
webscout/scout/element.py
CHANGED
|
@@ -1,478 +1,872 @@
|
|
|
1
|
-
"""
|
|
2
|
-
Scout Element Module - Advanced HTML Element Representation
|
|
3
|
-
"""
|
|
4
|
-
|
|
5
|
-
import re
|
|
6
|
-
from typing import Any, Dict, List, Optional, Union
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class NavigableString(str):
|
|
10
|
-
"""
|
|
11
|
-
A string that knows its place in the document tree.
|
|
12
|
-
Mimics
|
|
13
|
-
"""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
"""
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
def
|
|
37
|
-
"""
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
"""
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
"""
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
""
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
)
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
if
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
#
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
1
|
+
"""
|
|
2
|
+
Scout Element Module - Advanced HTML Element Representation
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any, Dict, List, Optional, Union
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NavigableString(str):
    """
    Text node that remembers which tag it belongs to.

    Behaves like a plain ``str`` while carrying a ``parent`` reference,
    in the spirit of BS4's NavigableString.
    """

    # Owning tag, or None when no parent has been assigned
    parent: Optional["Tag"]

    def __new__(cls, text: str):
        """
        Build the underlying immutable string value.

        Args:
            text (str): String content
        """
        return super().__new__(cls, text)

    def __init__(self, text: str):
        """
        Start the string out with no parent.

        Args:
            text (str): String content (the value itself is set in ``__new__``)
        """
        self.parent = None

    def __repr__(self):
        """Return the plain-string repr wrapped in ``NavigableString(...)``."""
        inner = str.__repr__(self)
        return f"NavigableString({inner})"

    def __add__(self, other):
        """
        Concatenate with another value after converting both sides to ``str``.

        Args:
            other: Value to append; converted with ``str()``

        Returns:
            str: Plain concatenated string (not a NavigableString)
        """
        left = str(self)
        right = str(other)
        return left + right

    def strip(self, chars=None):
        """
        Remove leading/trailing whitespace, or the given characters.

        Args:
            chars (str, optional): Characters to strip

        Returns:
            NavigableString: New instance holding the stripped text
        """
        trimmed = str.strip(self, chars)
        return NavigableString(trimmed)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class Tag:
|
|
66
|
+
"""
|
|
67
|
+
Represents an HTML tag with advanced traversal and manipulation capabilities.
|
|
68
|
+
Enhanced to closely mimic BS4's Tag class.
|
|
69
|
+
"""
|
|
70
|
+
|
|
71
|
+
def __init__(self, name: str, attrs: Optional[Dict[str, str]] = None):
|
|
72
|
+
"""
|
|
73
|
+
Initialize a Tag with name and attributes.
|
|
74
|
+
|
|
75
|
+
Args:
|
|
76
|
+
name (str): Tag name
|
|
77
|
+
attrs (dict, optional): Tag attributes
|
|
78
|
+
"""
|
|
79
|
+
self.name = name
|
|
80
|
+
self.attrs: Dict[str, str] = attrs or {}
|
|
81
|
+
self.contents: List[Union["Tag", NavigableString, str]] = []
|
|
82
|
+
self.parent: Optional["Tag"] = None
|
|
83
|
+
self._string: Optional[str] = None # For single string content
|
|
84
|
+
|
|
85
|
+
def __str__(self):
|
|
86
|
+
"""String representation of the tag."""
|
|
87
|
+
return self.decode_contents()
|
|
88
|
+
|
|
89
|
+
def __repr__(self):
|
|
90
|
+
"""Detailed representation of the tag."""
|
|
91
|
+
return f"<{self.name} {self.attrs}>"
|
|
92
|
+
|
|
93
|
+
def __call__(self, *args, **kwargs):
|
|
94
|
+
"""
|
|
95
|
+
Allows calling find_all directly on the tag.
|
|
96
|
+
Mimics BS4's behavior.
|
|
97
|
+
"""
|
|
98
|
+
return self.find_all(*args, **kwargs)
|
|
99
|
+
|
|
100
|
+
def __contains__(self, item):
|
|
101
|
+
"""
|
|
102
|
+
Check if an item is in the tag's contents.
|
|
103
|
+
|
|
104
|
+
Args:
|
|
105
|
+
item: Item to search for
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
bool: True if item is in contents, False otherwise
|
|
109
|
+
"""
|
|
110
|
+
return item in self.contents
|
|
111
|
+
|
|
112
|
+
def __getitem__(self, key):
|
|
113
|
+
"""
|
|
114
|
+
Get an attribute value using dictionary-like access.
|
|
115
|
+
|
|
116
|
+
Args:
|
|
117
|
+
key (str): Attribute name
|
|
118
|
+
|
|
119
|
+
Returns:
|
|
120
|
+
Any: Attribute value
|
|
121
|
+
"""
|
|
122
|
+
return self.attrs[key]
|
|
123
|
+
|
|
124
|
+
def __iter__(self):
|
|
125
|
+
"""
|
|
126
|
+
Iterate through tag's contents.
|
|
127
|
+
|
|
128
|
+
Returns:
|
|
129
|
+
Iterator: Contents of the tag
|
|
130
|
+
"""
|
|
131
|
+
return iter(self.contents)
|
|
132
|
+
|
|
133
|
+
def __eq__(self, other):
|
|
134
|
+
"""
|
|
135
|
+
Compare tags based on name and attributes.
|
|
136
|
+
|
|
137
|
+
Args:
|
|
138
|
+
other (Tag): Tag to compare
|
|
139
|
+
|
|
140
|
+
Returns:
|
|
141
|
+
bool: True if tags are equivalent
|
|
142
|
+
"""
|
|
143
|
+
if not isinstance(other, Tag):
|
|
144
|
+
return False
|
|
145
|
+
return self.name == other.name and self.attrs == other.attrs and str(self) == str(other)
|
|
146
|
+
|
|
147
|
+
def __hash__(self):
|
|
148
|
+
"""
|
|
149
|
+
Generate a hash for the tag.
|
|
150
|
+
|
|
151
|
+
Returns:
|
|
152
|
+
int: Hash value
|
|
153
|
+
"""
|
|
154
|
+
return hash((self.name, frozenset(self.attrs.items()), str(self)))
|
|
155
|
+
|
|
156
|
+
def find(
    self, name=None, attrs={}, recursive=True, text=None, limit=None, class_=None, **kwargs
) -> Optional["Tag"]:
    """
    Find the first matching child element.

    Delegates to ``find_all`` with ``limit=1`` and returns the first hit.
    When both ``attrs['class']`` and ``class_`` are supplied, the two are
    merged into one list of class names. The caller's ``attrs`` dict and any
    list stored under its ``'class'`` key are never modified.

    Args:
        name (str, optional): Tag name to search for
        attrs (dict, optional): Attributes to match
        recursive (bool, optional): Search recursively
        text (str, optional): Text content to match
        limit (int, optional): Accepted for signature parity with
            ``find_all`` but ignored; the search always uses ``limit=1``
        class_ (str or list, optional): Class name(s) to match; a string is
            split on spaces and commas
        **kwargs: Forwarded unchanged to ``find_all``

    Returns:
        Tag or None: First matching element
    """

    def _to_class_list(value):
        # Always return a new list so the caller's object is never aliased.
        if isinstance(value, list):
            return list(value)
        return [part.strip() for part in re.split(r"[ ,]+", str(value)) if part.strip()]

    # Shallow copy: adding/replacing the 'class' key must not leak to the caller.
    attrs = dict(attrs) if attrs else {}
    if class_ is not None:
        if "class" in attrs:
            # Both sources given: search for the union of the class names.
            attrs["class"] = _to_class_list(attrs["class"]) + _to_class_list(class_)
        else:
            attrs["class"] = class_
    results = self.find_all(name, attrs, recursive, text, limit=1, **kwargs)
    return results[0] if results else None
|
|
196
|
+
|
|
197
|
+
def find_all(
    self, name=None, attrs=None, recursive=True, text=None, limit=None, class_=None, **kwargs
) -> List["Tag"]:
    """
    Find all matching elements, starting with this tag itself.

    The search tests this tag first and, when ``recursive`` is true, every
    descendant ``Tag`` in document order. With ``recursive=False`` only this
    tag is tested.

    Args:
        name (str | re.Pattern | list | tuple, optional): Tag name filter.
            Strings and sequences compare case-insensitively, ``"*"`` matches
            any name, and a compiled pattern is applied with ``search``.
        attrs (dict, optional): Attributes to match. ``True`` / ``False``
            require presence / absence, a compiled pattern is searched in the
            stringified value, anything else must compare equal. The
            ``"class"`` key accepts a space/comma separated string, a list
            (all classes must be present) or a compiled pattern (any class
            may match).
        recursive (bool, optional): Descend into child tags.
        text (str | re.Pattern, optional): Substring or pattern that must
            occur in ``tag.get_text(strip=True)``.
        limit (int, optional): Maximum number of results; a falsy value
            means no limit.
        class_ (optional): Shortcut for ``attrs["class"]``; overrides it when
            both are given.
        **kwargs: Accepted for call compatibility; not used by this method.

    Returns:
        List[Tag]: Matching elements in document order, at most ``limit``.
    """
    class_splitter = re.compile(r"[ ,]+")

    def _split_classes(value):
        return [part.strip() for part in class_splitter.split(value) if part.strip()]

    # Build the attribute filter once instead of once per visited tag.
    # ``attrs=None`` is tolerated and treated like an empty filter.
    search_attrs = dict(attrs) if attrs else {}
    if class_ is not None:
        search_attrs["class"] = class_

    lowered_names = None
    if name and isinstance(name, (list, tuple)):
        lowered_names = [candidate.lower() for candidate in name]

    wanted_class = search_attrs.get("class")
    if isinstance(wanted_class, str):
        wanted_class = _split_classes(wanted_class)

    def _name_ok(tag):
        if not name:
            return True
        if isinstance(name, str):
            return name == "*" or tag.name.lower() == name.lower()
        if isinstance(name, re.Pattern):
            return bool(name.search(tag.name))
        if lowered_names is not None:
            return tag.name.lower() in lowered_names
        # Unsupported filter types do not restrict the search.
        return True

    def _class_ok(tag_attr):
        if isinstance(tag_attr, str):
            tag_classes = _split_classes(tag_attr)
        elif isinstance(tag_attr, list):
            tag_classes = tag_attr
        else:
            tag_classes = []

        if isinstance(wanted_class, list):
            return all(cls in tag_classes for cls in wanted_class)
        if isinstance(wanted_class, re.Pattern):
            return any(wanted_class.search(cls) for cls in tag_classes)
        # Any other filter value (None, bool, ...) never matches a class.
        return False

    def _attr_ok(tag_attr, wanted):
        if wanted is True:
            return tag_attr is not None
        if wanted is False:
            return tag_attr is None
        if isinstance(wanted, re.Pattern):
            return tag_attr is not None and bool(wanted.search(str(tag_attr)))
        return tag_attr == wanted

    def _text_ok(tag):
        if not text:
            return True
        tag_text = tag.get_text(strip=True)
        if isinstance(text, str):
            return text in tag_text
        if isinstance(text, re.Pattern):
            return bool(text.search(tag_text))
        return True

    def _match(tag):
        if not _name_ok(tag):
            return False
        for key, wanted in search_attrs.items():
            tag_attr = tag.attrs.get(key)
            if key == "class":
                if not _class_ok(tag_attr):
                    return False
            elif not _attr_ok(tag_attr, wanted):
                return False
        return _text_ok(tag)

    results = []

    def _search(element):
        """Collect matches below ``element``; return True once ``limit`` is hit."""
        if _match(element):
            results.append(element)
        if limit and len(results) >= limit:
            return True
        if recursive:
            for child in element.contents:
                # Propagate the stop signal so sibling subtrees are not
                # visited after the limit has been reached.
                if isinstance(child, Tag) and _search(child):
                    return True
        return False

    _search(self)
    return results
|
|
299
|
+
|
|
300
|
+
def select(self, selector: str) -> List["Tag"]:
    """
    Select elements using a CSS selector.

    Supported forms:
    - Tag selectors: 'p', 'div'
    - Class selectors: '.class', 'p.class', '.class1.class2'
    - ID selectors: '#id', 'div#id'
    - Attribute selectors: '[attr]', '[attr=value]'
    - Descendant selectors: 'div p'
    - Child selectors: 'div > p' (the '>' must be surrounded by spaces)

    The selector is dispatched on its combinator: one containing ' > ' goes
    to ``_select_with_child_combinator``, one containing other whitespace
    goes to ``_select_with_descendant_combinator``, and anything else is
    parsed as a single simple selector and matched against this tag and all
    of its descendants.

    Args:
        selector (str): CSS selector string

    Returns:
        List[Tag]: List of matching elements
    """
    # Surrounding whitespace is not part of the selector. Without stripping,
    # " p" failed the tag regex and therefore matched every element.
    selector = selector.strip()

    if " > " in selector:
        parts = [part.strip() for part in selector.split(" > ")]
        return self._select_with_child_combinator(parts)

    if " " in selector:
        return self._select_with_descendant_combinator(selector.split())

    # Simple selector: reuse the class-level parser/matcher rather than
    # keeping private copies of the same logic inside this method.
    components = self._parse_selector_components(selector)
    return self._find_all_matching_in_tree(self, components)
|
|
413
|
+
|
|
414
|
+
def _select_with_descendant_combinator(self, parts: List[str]) -> List["Tag"]:
|
|
415
|
+
"""Handle descendant combinator (space)."""
|
|
416
|
+
if not parts:
|
|
417
|
+
return []
|
|
418
|
+
|
|
419
|
+
if len(parts) == 1:
|
|
420
|
+
components = self._parse_selector_components(parts[0])
|
|
421
|
+
return self._find_all_matching_in_tree(self, components)
|
|
422
|
+
|
|
423
|
+
# Find elements matching the first part
|
|
424
|
+
first_components = self._parse_selector_components(parts[0])
|
|
425
|
+
first_matches = self._find_all_matching_in_tree(self, first_components)
|
|
426
|
+
|
|
427
|
+
# For each match, find descendants matching remaining parts
|
|
428
|
+
results = []
|
|
429
|
+
remaining_selector = " ".join(parts[1:])
|
|
430
|
+
for match in first_matches:
|
|
431
|
+
descendants = match.select(remaining_selector)
|
|
432
|
+
results.extend(descendants)
|
|
433
|
+
|
|
434
|
+
return results
|
|
435
|
+
|
|
436
|
+
def _select_with_child_combinator(self, parts: List[str]) -> List["Tag"]:
|
|
437
|
+
"""Handle child combinator (>)."""
|
|
438
|
+
if not parts:
|
|
439
|
+
return []
|
|
440
|
+
|
|
441
|
+
if len(parts) == 1:
|
|
442
|
+
components = self._parse_selector_components(parts[0])
|
|
443
|
+
return self._find_all_matching_in_tree(self, components)
|
|
444
|
+
|
|
445
|
+
# Find elements matching the first part
|
|
446
|
+
first_components = self._parse_selector_components(parts[0])
|
|
447
|
+
first_matches = self._find_all_matching_in_tree(self, first_components)
|
|
448
|
+
|
|
449
|
+
# For each match, find direct children matching the next part
|
|
450
|
+
if len(parts) == 2:
|
|
451
|
+
# Last part, just check direct children
|
|
452
|
+
next_components = self._parse_selector_components(parts[1])
|
|
453
|
+
results = []
|
|
454
|
+
for match in first_matches:
|
|
455
|
+
for child in match.contents:
|
|
456
|
+
if isinstance(child, Tag) and self._match_selector_components(
|
|
457
|
+
child, next_components
|
|
458
|
+
):
|
|
459
|
+
results.append(child)
|
|
460
|
+
return results
|
|
461
|
+
else:
|
|
462
|
+
# More parts, need to continue recursively
|
|
463
|
+
results = []
|
|
464
|
+
next_components = self._parse_selector_components(parts[1])
|
|
465
|
+
remaining_parts = parts[2:]
|
|
466
|
+
for match in first_matches:
|
|
467
|
+
for child in match.contents:
|
|
468
|
+
if isinstance(child, Tag) and self._match_selector_components(
|
|
469
|
+
child, next_components
|
|
470
|
+
):
|
|
471
|
+
# Continue with remaining parts
|
|
472
|
+
remaining_selector = " > ".join(remaining_parts)
|
|
473
|
+
descendants = child.select(remaining_selector)
|
|
474
|
+
results.extend(descendants)
|
|
475
|
+
return results
|
|
476
|
+
|
|
477
|
+
def _parse_selector_components(self, simple_sel: str) -> dict:
|
|
478
|
+
"""Parse a simple selector like 'p.class#id[attr=value]' into components."""
|
|
479
|
+
components = {"tag": None, "id": None, "classes": [], "attrs": {}}
|
|
480
|
+
|
|
481
|
+
# Extract tag name (at the start)
|
|
482
|
+
tag_match = re.match(r"^([a-zA-Z][\w-]*)", simple_sel)
|
|
483
|
+
if tag_match:
|
|
484
|
+
components["tag"] = tag_match.group(1)
|
|
485
|
+
simple_sel = simple_sel[len(tag_match.group(1)) :]
|
|
486
|
+
|
|
487
|
+
# Extract ID
|
|
488
|
+
id_matches = re.findall(r"#([\w-]+)", simple_sel)
|
|
489
|
+
if id_matches:
|
|
490
|
+
components["id"] = id_matches[0]
|
|
491
|
+
|
|
492
|
+
# Extract classes
|
|
493
|
+
class_matches = re.findall(r"\.([\w-]+)", simple_sel)
|
|
494
|
+
components["classes"] = class_matches
|
|
495
|
+
|
|
496
|
+
# Extract attributes
|
|
497
|
+
attr_matches = re.findall(r"\[([^\]]+)\]", simple_sel)
|
|
498
|
+
for attr_expr in attr_matches:
|
|
499
|
+
if "=" in attr_expr:
|
|
500
|
+
attr_name, attr_value = attr_expr.split("=", 1)
|
|
501
|
+
if isinstance(components["attrs"], dict):
|
|
502
|
+
components["attrs"][attr_name.strip()] = attr_value.strip("'\"")
|
|
503
|
+
else:
|
|
504
|
+
if isinstance(components["attrs"], dict):
|
|
505
|
+
components["attrs"][attr_expr.strip()] = None
|
|
506
|
+
|
|
507
|
+
return components
|
|
508
|
+
|
|
509
|
+
def _match_selector_components(self, tag: "Tag", components: dict) -> bool:
|
|
510
|
+
"""Check if a tag matches the parsed selector components."""
|
|
511
|
+
# Check tag name
|
|
512
|
+
if components["tag"] and tag.name != components["tag"]:
|
|
513
|
+
return False
|
|
514
|
+
|
|
515
|
+
# Check ID
|
|
516
|
+
if components["id"] and tag.get("id") != components["id"]:
|
|
517
|
+
return False
|
|
518
|
+
|
|
519
|
+
# Check classes
|
|
520
|
+
tag_classes = tag.get("class", "")
|
|
521
|
+
if isinstance(tag_classes, str):
|
|
522
|
+
tag_classes = tag_classes.split()
|
|
523
|
+
elif not isinstance(tag_classes, list):
|
|
524
|
+
tag_classes = [str(tag_classes)] if tag_classes else []
|
|
525
|
+
|
|
526
|
+
for cls in components["classes"]:
|
|
527
|
+
if cls not in tag_classes:
|
|
528
|
+
return False
|
|
529
|
+
|
|
530
|
+
# Check attributes
|
|
531
|
+
for attr_name, attr_value in components["attrs"].items():
|
|
532
|
+
if attr_value is None:
|
|
533
|
+
# Just check attribute exists
|
|
534
|
+
if attr_name not in tag.attrs:
|
|
535
|
+
return False
|
|
536
|
+
else:
|
|
537
|
+
# Check attribute value
|
|
538
|
+
if tag.get(attr_name) != attr_value:
|
|
539
|
+
return False
|
|
540
|
+
|
|
541
|
+
return True
|
|
542
|
+
|
|
543
|
+
def _find_all_matching_in_tree(self, element: "Tag", components: dict) -> List["Tag"]:
|
|
544
|
+
"""Recursively find all elements matching the selector components."""
|
|
545
|
+
matches = []
|
|
546
|
+
|
|
547
|
+
# Check current element
|
|
548
|
+
if self._match_selector_components(element, components):
|
|
549
|
+
matches.append(element)
|
|
550
|
+
|
|
551
|
+
# Check children recursively
|
|
552
|
+
for child in element.contents:
|
|
553
|
+
if isinstance(child, Tag):
|
|
554
|
+
matches.extend(self._find_all_matching_in_tree(child, components))
|
|
555
|
+
|
|
556
|
+
return matches
|
|
557
|
+
|
|
558
|
+
def select_one(self, selector: str) -> Optional["Tag"]:
    """
    Return the first element matched by a CSS selector.

    Args:
        selector (str): CSS selector string, passed unchanged to ``select``.

    Returns:
        Tag or None: First item of the ``select`` result, or None when the
        result is empty.
    """
    matches = self.select(selector)
    if not matches:
        return None
    return matches[0]
|
|
570
|
+
|
|
571
|
+
def get_text(self, separator=" ", strip=False, types=None) -> str:
    """
    Extract text from the tag and its descendants.

    Args:
        separator (str, optional): String placed between the text of
            adjacent direct children.
        strip (bool, optional): Strip leading/trailing whitespace from the
            result (also forwarded to nested tags).
        types (list, optional): When given, only direct children whose exact
            type is in this collection are considered. The filter is not
            forwarded to nested tags.

    Returns:
        str: Extracted text, with runs of two or more newlines collapsed
        into a single newline.
    """
    pieces = []
    for node in self.contents:
        if types is not None and type(node) not in types:
            continue
        if isinstance(node, NavigableString):
            pieces.append(str(node))
        elif isinstance(node, Tag):
            pieces.append(node.get_text(separator, strip))

    joined = re.sub(r"\n\n+", "\n", separator.join(pieces))
    if strip:
        return joined.strip()
    return joined
|
|
596
|
+
|
|
597
|
+
def find_text(self, pattern: Union[str, re.Pattern], **kwargs) -> Optional[str]:
    """
    Find the first text matching a pattern.

    Args:
        pattern (str or re.Pattern): Substring to look for, or a compiled
            pattern applied with ``search``.
        **kwargs: Additional arguments forwarded to get_text()

    Returns:
        str or None: ``pattern`` itself when it is a string contained in the
        text, the matched substring for a compiled pattern, otherwise None
        (including when ``pattern`` is of any other type).
    """
    haystack = self.get_text(**kwargs)

    if isinstance(pattern, re.Pattern):
        found = pattern.search(haystack)
        if found:
            return found.group(0)
        return None

    if isinstance(pattern, str):
        if pattern in haystack:
            return pattern
        return None

    return None
|
|
615
|
+
|
|
616
|
+
def replace_text(self, old: Union[str, re.Pattern], new: str, **kwargs) -> str:
    """
    Return the tag's text with matches of a pattern replaced.

    The tag itself is not modified; only the string produced by
    ``get_text`` is transformed.

    Args:
        old (str or re.Pattern): Substring or compiled pattern to replace
        new (str): Replacement text
        **kwargs: Additional arguments forwarded to get_text()

    Returns:
        str: Modified text. Falls through and returns None when ``old`` is
        neither a string nor a compiled pattern.
    """
    source = self.get_text(**kwargs)

    if isinstance(old, re.Pattern):
        return old.sub(new, source)
    if isinstance(old, str):
        return source.replace(old, new)
|
|
634
|
+
|
|
635
|
+
def get(self, key: str, default: Any = None) -> Any:
    """
    Look up an attribute of the tag.

    Args:
        key (str): Attribute name
        default (Any, optional): Value returned when the attribute is absent

    Returns:
        Any: The entry of ``self.attrs`` for ``key``, or ``default``
    """
    attributes = self.attrs
    return attributes.get(key, default)
|
|
647
|
+
|
|
648
|
+
def decompose(self) -> None:
    """
    Remove the tag and its contents from the document.

    Does nothing when the tag has no parent. After removal the tag's
    ``parent`` is reset to None, so it no longer points into a tree it is
    not part of and a repeated call is a harmless no-op.
    """
    parent = self.parent
    if not parent:
        return
    parent.contents.remove(self)
    # Drop the back-reference. Previously it was left in place, so a second
    # decompose()/extract() raised ValueError from list.remove().
    self.parent = None
|
|
652
|
+
|
|
653
|
+
def extract(self) -> "Tag":
    """
    Detach the tag from the document and hand it back to the caller.

    Removal is delegated to ``decompose``.

    Returns:
        Tag: This tag
    """
    self.decompose()
    return self
|
|
662
|
+
|
|
663
|
+
def clear(self) -> None:
    """Empty the tag's ``contents`` list in place."""
    del self.contents[:]
|
|
666
|
+
|
|
667
|
+
@property
def string(self) -> Optional[str]:
    """
    String content of the tag.

    Returns ``_string`` when it has been set to something other than None;
    otherwise the result of ``get_text()``.
    """
    explicit = self._string
    if explicit is None:
        return self.get_text()
    return explicit
|
|
674
|
+
|
|
675
|
+
@property
def text(self) -> str:
    """BS4-compatible ``text`` property: the result of ``get_text()`` with default arguments."""
    content = self.get_text()
    return content
|
|
679
|
+
|
|
680
|
+
@string.setter
def string(self, value: Optional[str]) -> None:
    """
    Replace the tag's content with a single string.

    Stores ``value`` in ``_string``, clears the existing contents and, when
    ``value`` is not None, appends it as the only child.

    Args:
        value (str | None): New string content
    """
    self._string = value
    self.clear()
    if value is None:
        return
    self.append(value)
|
|
693
|
+
|
|
694
|
+
def append(self, new_child: Union["Tag", NavigableString, str]) -> None:
    """
    Add a child at the end of this tag's contents.

    A ``str`` argument is wrapped in ``NavigableString`` first. If the child
    exposes a ``parent`` attribute it is pointed at this tag.
    """
    node = NavigableString(new_child) if isinstance(new_child, str) else new_child
    if hasattr(node, "parent"):
        node.parent = self
    self.contents.append(node)
|
|
701
|
+
|
|
702
|
+
def extend(self, new_children: List[Union["Tag", NavigableString, str]]) -> None:
    """Append each item of ``new_children``, in order, via ``append``."""
    for item in new_children:
        self.append(item)
|
|
706
|
+
|
|
707
|
+
def insert(self, index: int, new_child: Union["Tag", NavigableString, str]) -> None:
    """
    Insert a child into this tag's contents at ``index``.

    A ``str`` argument is wrapped in ``NavigableString`` first. If the child
    exposes a ``parent`` attribute it is pointed at this tag. ``index``
    follows ``list.insert`` semantics.
    """
    node = NavigableString(new_child) if isinstance(new_child, str) else new_child
    if hasattr(node, "parent"):
        node.parent = self
    self.contents.insert(index, node)
|
|
714
|
+
|
|
715
|
+
def replace_with(self, new_tag: "Tag") -> None:
    """
    Replace this tag with another tag in its parent's contents.

    Does nothing when this tag has no parent, or when it is not actually
    present in the parent's contents. After a successful replacement
    ``new_tag.parent`` is the former parent and this tag's ``parent`` is
    reset to None.

    Args:
        new_tag (Tag): Element that takes this tag's position.
    """
    parent = self.parent
    if not parent:
        return
    try:
        # Only the lookup can raise ValueError, so only it is guarded.
        index = parent.contents.index(self)
    except ValueError:
        # Stale parent pointer: nothing to replace (best effort, as before).
        return
    parent.contents[index] = new_tag
    new_tag.parent = parent
    # This tag has left the tree; do not keep pointing at the old parent.
    self.parent = None
|
|
724
|
+
|
|
725
|
+
def wrap(self, wrapper_tag: "Tag") -> "Tag":
    """
    Wrap this tag in another tag.

    When this tag has a parent, ``wrapper_tag`` takes its slot in the
    parent's contents; otherwise the wrapper's parent is set to None. This
    tag is then appended to the wrapper's contents and re-parented to it.

    Args:
        wrapper_tag (Tag): Element that becomes this tag's new parent.

    Returns:
        Tag: ``wrapper_tag``
    """
    host = self.parent
    if host:
        slot = host.contents.index(self)
        host.contents[slot] = wrapper_tag
    wrapper_tag.parent = host if host else None

    wrapper_tag.contents.append(self)
    self.parent = wrapper_tag
    return wrapper_tag
|
|
736
|
+
|
|
737
|
+
def unwrap(self) -> None:
    """
    Remove this tag but keep its contents in the parent.

    Children that are ``Tag`` or ``NavigableString`` instances are moved
    into the parent at this tag's position, in their original order, and
    re-parented. This tag is then removed from the parent, its ``parent``
    is set to None and its ``contents`` is replaced with an empty list.
    Nothing happens when the tag has no parent.
    """
    host = self.parent
    if not host:
        return

    position = host.contents.index(self)
    survivors = [
        child for child in self.contents if isinstance(child, (Tag, NavigableString))
    ]
    for child in survivors:
        child.parent = host
    # Splice the children in just before this tag.
    host.contents[position:position] = survivors

    host.contents.remove(self)
    self.parent = None
    self.contents = []
|
|
748
|
+
|
|
749
|
+
def insert_before(self, new_element: "Tag") -> None:
    """
    Insert an element immediately before this tag in its parent.

    The element's ``parent`` is set to this tag's parent. Nothing happens
    when this tag has no parent.
    """
    host = self.parent
    if not host:
        return
    position = host.contents.index(self)
    new_element.parent = host
    host.contents.insert(position, new_element)
|
|
755
|
+
|
|
756
|
+
def insert_after(self, new_element: "Tag") -> None:
    """
    Insert an element immediately after this tag in its parent.

    The element's ``parent`` is set to this tag's parent. Nothing happens
    when this tag has no parent.
    """
    host = self.parent
    if not host:
        return
    position = host.contents.index(self)
    new_element.parent = host
    host.contents.insert(position + 1, new_element)
|
|
762
|
+
|
|
763
|
+
@property
def descendants(self):
    """
    Generate every node below this tag in document order.

    Each direct child is yielded first, followed (for ``Tag`` children) by
    that child's own descendants.
    """
    for node in self.contents:
        yield node
        if not isinstance(node, Tag):
            continue
        yield from node.descendants
|
|
770
|
+
|
|
771
|
+
@property
def parents(self):
    """Generate this tag's ancestors, nearest first, by following ``parent`` links until one is falsy."""
    ancestor = self.parent
    while ancestor:
        yield ancestor
        ancestor = ancestor.parent
|
|
778
|
+
|
|
779
|
+
@property
def next_element(self):
    """
    Return the next element in document order.

    That is the first child when this tag has contents; otherwise the next
    sibling of this tag or of its closest ancestor that has one. Returns
    None when no such element exists.
    """
    children = self.contents
    if children:
        return children[0]

    node = self
    while node.parent:
        siblings = node.parent.contents
        following = siblings.index(node) + 1
        if following < len(siblings):
            return siblings[following]
        # Last child at this level: continue the search one level up.
        node = node.parent
    return None
|
|
791
|
+
|
|
792
|
+
@property
def previous_element(self):
    """
    Return the previous element in document order.

    For a first child this is the parent. Otherwise it is the previous
    sibling, or the last node reached by repeatedly taking the last child
    while that sibling is a ``Tag`` with contents. Returns None when this
    tag has no parent.
    """
    host = self.parent
    if not host:
        return None

    position = host.contents.index(self)
    if position == 0:
        return host

    node = host.contents[position - 1]
    while isinstance(node, Tag) and node.contents:
        node = node.contents[-1]
    return node
|
|
804
|
+
|
|
805
|
+
def decode_contents(self, eventual_encoding="utf-8") -> str:
    """
    Render the tag's direct contents as one string.

    Args:
        eventual_encoding (str, optional): Accepted for API compatibility;
            not used by this implementation.

    Returns:
        str: ``str()`` of every item in ``contents``, concatenated without
        a separator
    """
    return "".join(map(str, self.contents))
|
|
816
|
+
|
|
817
|
+
def prettify(self, formatter="minimal") -> str:
    """
    Return an indented, one-node-per-line representation of the tag.

    Attributes are written as ``name="value"`` (list values joined with a
    space). Void elements without contents are emitted as ``<name />``.
    Child nodes are indented by two further indentation units, and text
    nodes that are empty after stripping are omitted.

    Args:
        formatter (str, optional): Accepted for API compatibility; not used
            by this implementation.

    Returns:
        str: Prettified tag representation
    """
    void_tags = frozenset(
        (
            "br",
            "img",
            "input",
            "hr",
            "meta",
            "link",
            "base",
            "area",
            "col",
            "embed",
            "keygen",
            "source",
            "track",
            "wbr",
        )
    )

    def _render(node, depth=0):
        pieces = [" " * depth, f"<{node.name}"]
        for attr_name, attr_value in node.attrs.items():
            if isinstance(attr_value, list):
                attr_value = " ".join(attr_value)
            pieces.append(f' {attr_name}="{attr_value}"')

        # Void element with nothing inside: emit the self-closing form.
        if node.name.lower() in void_tags and not node.contents:
            pieces.append(" />\n")
            return "".join(pieces)

        pieces.append(">\n")

        child_pad = " " * (depth + 2)
        for content in node.contents:
            if isinstance(content, Tag):
                pieces.append(_render(content, depth + 2))
            elif isinstance(content, NavigableString):
                if content.strip():
                    pieces.append(child_pad + str(content) + "\n")
            elif str(content).strip():
                pieces.append(child_pad + str(content) + "\n")

        pieces.append(" " * depth + f"</{node.name}>\n")
        return "".join(pieces)

    return _render(self)
|