webscout-8.2.2-py3-none-any.whl → webscout-2026.1.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (483)
  1. webscout/AIauto.py +524 -143
  2. webscout/AIbase.py +247 -123
  3. webscout/AIutel.py +68 -132
  4. webscout/Bard.py +1072 -535
  5. webscout/Extra/GitToolkit/__init__.py +2 -2
  6. webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
  7. webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
  8. webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
  10. webscout/Extra/GitToolkit/gitapi/search.py +162 -0
  11. webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
  12. webscout/Extra/GitToolkit/gitapi/user.py +128 -96
  13. webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
  14. webscout/Extra/YTToolkit/README.md +443 -0
  15. webscout/Extra/YTToolkit/YTdownloader.py +953 -957
  16. webscout/Extra/YTToolkit/__init__.py +3 -3
  17. webscout/Extra/YTToolkit/transcriber.py +595 -476
  18. webscout/Extra/YTToolkit/ytapi/README.md +230 -0
  19. webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
  20. webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
  21. webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
  22. webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
  23. webscout/Extra/YTToolkit/ytapi/extras.py +178 -45
  24. webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
  25. webscout/Extra/YTToolkit/ytapi/https.py +89 -88
  26. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
  27. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
  28. webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
  29. webscout/Extra/YTToolkit/ytapi/query.py +143 -40
  30. webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
  31. webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
  32. webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
  33. webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
  34. webscout/Extra/YTToolkit/ytapi/video.py +189 -18
  35. webscout/Extra/__init__.py +2 -3
  36. webscout/Extra/gguf.py +1298 -682
  37. webscout/Extra/tempmail/README.md +488 -0
  38. webscout/Extra/tempmail/__init__.py +28 -28
  39. webscout/Extra/tempmail/async_utils.py +143 -141
  40. webscout/Extra/tempmail/base.py +172 -161
  41. webscout/Extra/tempmail/cli.py +191 -187
  42. webscout/Extra/tempmail/emailnator.py +88 -84
  43. webscout/Extra/tempmail/mail_tm.py +378 -361
  44. webscout/Extra/tempmail/temp_mail_io.py +304 -292
  45. webscout/Extra/weather.py +196 -194
  46. webscout/Extra/weather_ascii.py +17 -15
  47. webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
  48. webscout/Provider/AISEARCH/Perplexity.py +237 -304
  49. webscout/Provider/AISEARCH/README.md +106 -0
  50. webscout/Provider/AISEARCH/__init__.py +16 -10
  51. webscout/Provider/AISEARCH/brave_search.py +298 -0
  52. webscout/Provider/AISEARCH/iask_search.py +130 -209
  53. webscout/Provider/AISEARCH/monica_search.py +200 -246
  54. webscout/Provider/AISEARCH/webpilotai_search.py +242 -281
  55. webscout/Provider/Algion.py +413 -0
  56. webscout/Provider/Andi.py +74 -69
  57. webscout/Provider/Apriel.py +313 -0
  58. webscout/Provider/Ayle.py +323 -0
  59. webscout/Provider/ChatSandbox.py +329 -0
  60. webscout/Provider/ClaudeOnline.py +365 -0
  61. webscout/Provider/Cohere.py +232 -208
  62. webscout/Provider/DeepAI.py +367 -0
  63. webscout/Provider/Deepinfra.py +343 -173
  64. webscout/Provider/EssentialAI.py +217 -0
  65. webscout/Provider/ExaAI.py +274 -261
  66. webscout/Provider/Gemini.py +60 -54
  67. webscout/Provider/GithubChat.py +385 -367
  68. webscout/Provider/Gradient.py +286 -0
  69. webscout/Provider/Groq.py +556 -670
  70. webscout/Provider/HadadXYZ.py +323 -0
  71. webscout/Provider/HeckAI.py +392 -233
  72. webscout/Provider/HuggingFace.py +387 -0
  73. webscout/Provider/IBM.py +340 -0
  74. webscout/Provider/Jadve.py +317 -266
  75. webscout/Provider/K2Think.py +306 -0
  76. webscout/Provider/Koboldai.py +221 -381
  77. webscout/Provider/Netwrck.py +273 -228
  78. webscout/Provider/Nvidia.py +310 -0
  79. webscout/Provider/OPENAI/DeepAI.py +489 -0
  80. webscout/Provider/OPENAI/K2Think.py +423 -0
  81. webscout/Provider/OPENAI/PI.py +463 -0
  82. webscout/Provider/OPENAI/README.md +890 -0
  83. webscout/Provider/OPENAI/TogetherAI.py +405 -0
  84. webscout/Provider/OPENAI/TwoAI.py +255 -0
  85. webscout/Provider/OPENAI/__init__.py +148 -25
  86. webscout/Provider/OPENAI/ai4chat.py +348 -0
  87. webscout/Provider/OPENAI/akashgpt.py +436 -0
  88. webscout/Provider/OPENAI/algion.py +303 -0
  89. webscout/Provider/OPENAI/ayle.py +365 -0
  90. webscout/Provider/OPENAI/base.py +253 -46
  91. webscout/Provider/OPENAI/cerebras.py +296 -0
  92. webscout/Provider/OPENAI/chatgpt.py +514 -193
  93. webscout/Provider/OPENAI/chatsandbox.py +233 -0
  94. webscout/Provider/OPENAI/deepinfra.py +403 -272
  95. webscout/Provider/OPENAI/e2b.py +2370 -1350
  96. webscout/Provider/OPENAI/elmo.py +278 -0
  97. webscout/Provider/OPENAI/exaai.py +186 -138
  98. webscout/Provider/OPENAI/freeassist.py +446 -0
  99. webscout/Provider/OPENAI/gradient.py +448 -0
  100. webscout/Provider/OPENAI/groq.py +380 -0
  101. webscout/Provider/OPENAI/hadadxyz.py +292 -0
  102. webscout/Provider/OPENAI/heckai.py +100 -104
  103. webscout/Provider/OPENAI/huggingface.py +321 -0
  104. webscout/Provider/OPENAI/ibm.py +425 -0
  105. webscout/Provider/OPENAI/llmchat.py +253 -0
  106. webscout/Provider/OPENAI/llmchatco.py +378 -327
  107. webscout/Provider/OPENAI/meta.py +541 -0
  108. webscout/Provider/OPENAI/netwrck.py +110 -84
  109. webscout/Provider/OPENAI/nvidia.py +317 -0
  110. webscout/Provider/OPENAI/oivscode.py +348 -0
  111. webscout/Provider/OPENAI/openrouter.py +328 -0
  112. webscout/Provider/OPENAI/pydantic_imports.py +1 -0
  113. webscout/Provider/OPENAI/sambanova.py +397 -0
  114. webscout/Provider/OPENAI/sonus.py +126 -115
  115. webscout/Provider/OPENAI/textpollinations.py +218 -133
  116. webscout/Provider/OPENAI/toolbaz.py +136 -166
  117. webscout/Provider/OPENAI/typefully.py +419 -0
  118. webscout/Provider/OPENAI/typliai.py +279 -0
  119. webscout/Provider/OPENAI/utils.py +314 -211
  120. webscout/Provider/OPENAI/wisecat.py +103 -125
  121. webscout/Provider/OPENAI/writecream.py +185 -156
  122. webscout/Provider/OPENAI/x0gpt.py +227 -136
  123. webscout/Provider/OPENAI/zenmux.py +380 -0
  124. webscout/Provider/OpenRouter.py +386 -0
  125. webscout/Provider/Openai.py +337 -496
  126. webscout/Provider/PI.py +443 -344
  127. webscout/Provider/QwenLM.py +346 -254
  128. webscout/Provider/STT/__init__.py +28 -0
  129. webscout/Provider/STT/base.py +303 -0
  130. webscout/Provider/STT/elevenlabs.py +264 -0
  131. webscout/Provider/Sambanova.py +317 -0
  132. webscout/Provider/TTI/README.md +69 -0
  133. webscout/Provider/TTI/__init__.py +37 -12
  134. webscout/Provider/TTI/base.py +147 -0
  135. webscout/Provider/TTI/claudeonline.py +393 -0
  136. webscout/Provider/TTI/magicstudio.py +292 -0
  137. webscout/Provider/TTI/miragic.py +180 -0
  138. webscout/Provider/TTI/pollinations.py +331 -0
  139. webscout/Provider/TTI/together.py +334 -0
  140. webscout/Provider/TTI/utils.py +14 -0
  141. webscout/Provider/TTS/README.md +186 -0
  142. webscout/Provider/TTS/__init__.py +43 -7
  143. webscout/Provider/TTS/base.py +523 -0
  144. webscout/Provider/TTS/deepgram.py +286 -156
  145. webscout/Provider/TTS/elevenlabs.py +189 -111
  146. webscout/Provider/TTS/freetts.py +218 -0
  147. webscout/Provider/TTS/murfai.py +288 -113
  148. webscout/Provider/TTS/openai_fm.py +364 -0
  149. webscout/Provider/TTS/parler.py +203 -111
  150. webscout/Provider/TTS/qwen.py +334 -0
  151. webscout/Provider/TTS/sherpa.py +286 -0
  152. webscout/Provider/TTS/speechma.py +693 -180
  153. webscout/Provider/TTS/streamElements.py +275 -333
  154. webscout/Provider/TTS/utils.py +280 -280
  155. webscout/Provider/TextPollinationsAI.py +221 -121
  156. webscout/Provider/TogetherAI.py +450 -0
  157. webscout/Provider/TwoAI.py +309 -199
  158. webscout/Provider/TypliAI.py +311 -0
  159. webscout/Provider/UNFINISHED/ChatHub.py +219 -0
  160. webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +160 -145
  161. webscout/Provider/UNFINISHED/GizAI.py +300 -0
  162. webscout/Provider/UNFINISHED/Marcus.py +218 -0
  163. webscout/Provider/UNFINISHED/Qodo.py +481 -0
  164. webscout/Provider/UNFINISHED/XenAI.py +330 -0
  165. webscout/Provider/{Youchat.py → UNFINISHED/Youchat.py} +64 -47
  166. webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
  167. webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
  168. webscout/Provider/UNFINISHED/liner.py +342 -0
  169. webscout/Provider/UNFINISHED/liner_api_request.py +246 -0
  170. webscout/Provider/UNFINISHED/samurai.py +231 -0
  171. webscout/Provider/WiseCat.py +256 -196
  172. webscout/Provider/WrDoChat.py +390 -0
  173. webscout/Provider/__init__.py +115 -198
  174. webscout/Provider/ai4chat.py +181 -202
  175. webscout/Provider/akashgpt.py +330 -342
  176. webscout/Provider/cerebras.py +397 -242
  177. webscout/Provider/cleeai.py +236 -213
  178. webscout/Provider/elmo.py +291 -234
  179. webscout/Provider/geminiapi.py +343 -208
  180. webscout/Provider/julius.py +245 -223
  181. webscout/Provider/learnfastai.py +333 -266
  182. webscout/Provider/llama3mitril.py +230 -180
  183. webscout/Provider/llmchat.py +308 -213
  184. webscout/Provider/llmchatco.py +321 -311
  185. webscout/Provider/meta.py +996 -794
  186. webscout/Provider/oivscode.py +332 -0
  187. webscout/Provider/searchchat.py +316 -293
  188. webscout/Provider/sonus.py +264 -208
  189. webscout/Provider/toolbaz.py +359 -320
  190. webscout/Provider/turboseek.py +332 -219
  191. webscout/Provider/typefully.py +262 -280
  192. webscout/Provider/x0gpt.py +332 -256
  193. webscout/__init__.py +31 -38
  194. webscout/__main__.py +5 -5
  195. webscout/cli.py +585 -293
  196. webscout/client.py +1497 -0
  197. webscout/conversation.py +140 -565
  198. webscout/exceptions.py +383 -339
  199. webscout/litagent/__init__.py +29 -29
  200. webscout/litagent/agent.py +492 -455
  201. webscout/litagent/constants.py +60 -60
  202. webscout/models.py +505 -181
  203. webscout/optimizers.py +32 -378
  204. webscout/prompt_manager.py +376 -274
  205. webscout/sanitize.py +1514 -0
  206. webscout/scout/README.md +452 -0
  207. webscout/scout/__init__.py +8 -8
  208. webscout/scout/core/__init__.py +7 -7
  209. webscout/scout/core/crawler.py +330 -140
  210. webscout/scout/core/scout.py +800 -568
  211. webscout/scout/core/search_result.py +51 -96
  212. webscout/scout/core/text_analyzer.py +64 -63
  213. webscout/scout/core/text_utils.py +412 -277
  214. webscout/scout/core/web_analyzer.py +54 -52
  215. webscout/scout/element.py +872 -460
  216. webscout/scout/parsers/__init__.py +70 -69
  217. webscout/scout/parsers/html5lib_parser.py +182 -172
  218. webscout/scout/parsers/html_parser.py +238 -236
  219. webscout/scout/parsers/lxml_parser.py +203 -178
  220. webscout/scout/utils.py +38 -37
  221. webscout/search/__init__.py +47 -0
  222. webscout/search/base.py +201 -0
  223. webscout/search/bing_main.py +45 -0
  224. webscout/search/brave_main.py +92 -0
  225. webscout/search/duckduckgo_main.py +57 -0
  226. webscout/search/engines/__init__.py +127 -0
  227. webscout/search/engines/bing/__init__.py +15 -0
  228. webscout/search/engines/bing/base.py +35 -0
  229. webscout/search/engines/bing/images.py +114 -0
  230. webscout/search/engines/bing/news.py +96 -0
  231. webscout/search/engines/bing/suggestions.py +36 -0
  232. webscout/search/engines/bing/text.py +109 -0
  233. webscout/search/engines/brave/__init__.py +19 -0
  234. webscout/search/engines/brave/base.py +47 -0
  235. webscout/search/engines/brave/images.py +213 -0
  236. webscout/search/engines/brave/news.py +353 -0
  237. webscout/search/engines/brave/suggestions.py +318 -0
  238. webscout/search/engines/brave/text.py +167 -0
  239. webscout/search/engines/brave/videos.py +364 -0
  240. webscout/search/engines/duckduckgo/__init__.py +25 -0
  241. webscout/search/engines/duckduckgo/answers.py +80 -0
  242. webscout/search/engines/duckduckgo/base.py +189 -0
  243. webscout/search/engines/duckduckgo/images.py +100 -0
  244. webscout/search/engines/duckduckgo/maps.py +183 -0
  245. webscout/search/engines/duckduckgo/news.py +70 -0
  246. webscout/search/engines/duckduckgo/suggestions.py +22 -0
  247. webscout/search/engines/duckduckgo/text.py +221 -0
  248. webscout/search/engines/duckduckgo/translate.py +48 -0
  249. webscout/search/engines/duckduckgo/videos.py +80 -0
  250. webscout/search/engines/duckduckgo/weather.py +84 -0
  251. webscout/search/engines/mojeek.py +61 -0
  252. webscout/search/engines/wikipedia.py +77 -0
  253. webscout/search/engines/yahoo/__init__.py +41 -0
  254. webscout/search/engines/yahoo/answers.py +19 -0
  255. webscout/search/engines/yahoo/base.py +34 -0
  256. webscout/search/engines/yahoo/images.py +323 -0
  257. webscout/search/engines/yahoo/maps.py +19 -0
  258. webscout/search/engines/yahoo/news.py +258 -0
  259. webscout/search/engines/yahoo/suggestions.py +140 -0
  260. webscout/search/engines/yahoo/text.py +273 -0
  261. webscout/search/engines/yahoo/translate.py +19 -0
  262. webscout/search/engines/yahoo/videos.py +302 -0
  263. webscout/search/engines/yahoo/weather.py +220 -0
  264. webscout/search/engines/yandex.py +67 -0
  265. webscout/search/engines/yep/__init__.py +13 -0
  266. webscout/search/engines/yep/base.py +34 -0
  267. webscout/search/engines/yep/images.py +101 -0
  268. webscout/search/engines/yep/suggestions.py +38 -0
  269. webscout/search/engines/yep/text.py +99 -0
  270. webscout/search/http_client.py +172 -0
  271. webscout/search/results.py +141 -0
  272. webscout/search/yahoo_main.py +57 -0
  273. webscout/search/yep_main.py +48 -0
  274. webscout/server/__init__.py +48 -0
  275. webscout/server/config.py +78 -0
  276. webscout/server/exceptions.py +69 -0
  277. webscout/server/providers.py +286 -0
  278. webscout/server/request_models.py +131 -0
  279. webscout/server/request_processing.py +404 -0
  280. webscout/server/routes.py +642 -0
  281. webscout/server/server.py +351 -0
  282. webscout/server/ui_templates.py +1171 -0
  283. webscout/swiftcli/__init__.py +79 -809
  284. webscout/swiftcli/core/__init__.py +7 -0
  285. webscout/swiftcli/core/cli.py +574 -0
  286. webscout/swiftcli/core/context.py +98 -0
  287. webscout/swiftcli/core/group.py +268 -0
  288. webscout/swiftcli/decorators/__init__.py +28 -0
  289. webscout/swiftcli/decorators/command.py +243 -0
  290. webscout/swiftcli/decorators/options.py +247 -0
  291. webscout/swiftcli/decorators/output.py +392 -0
  292. webscout/swiftcli/exceptions.py +21 -0
  293. webscout/swiftcli/plugins/__init__.py +9 -0
  294. webscout/swiftcli/plugins/base.py +134 -0
  295. webscout/swiftcli/plugins/manager.py +269 -0
  296. webscout/swiftcli/utils/__init__.py +58 -0
  297. webscout/swiftcli/utils/formatting.py +251 -0
  298. webscout/swiftcli/utils/parsing.py +368 -0
  299. webscout/update_checker.py +280 -136
  300. webscout/utils.py +28 -14
  301. webscout/version.py +2 -1
  302. webscout/version.py.bak +3 -0
  303. webscout/zeroart/__init__.py +218 -55
  304. webscout/zeroart/base.py +70 -60
  305. webscout/zeroart/effects.py +155 -99
  306. webscout/zeroart/fonts.py +1799 -816
  307. webscout-2026.1.19.dist-info/METADATA +638 -0
  308. webscout-2026.1.19.dist-info/RECORD +312 -0
  309. {webscout-8.2.2.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
  310. webscout-2026.1.19.dist-info/entry_points.txt +4 -0
  311. webscout-2026.1.19.dist-info/top_level.txt +1 -0
  312. inferno/__init__.py +0 -6
  313. inferno/__main__.py +0 -9
  314. inferno/cli.py +0 -6
  315. webscout/DWEBS.py +0 -477
  316. webscout/Extra/autocoder/__init__.py +0 -9
  317. webscout/Extra/autocoder/autocoder.py +0 -849
  318. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  319. webscout/LLM.py +0 -442
  320. webscout/Litlogger/__init__.py +0 -67
  321. webscout/Litlogger/core/__init__.py +0 -6
  322. webscout/Litlogger/core/level.py +0 -23
  323. webscout/Litlogger/core/logger.py +0 -165
  324. webscout/Litlogger/handlers/__init__.py +0 -12
  325. webscout/Litlogger/handlers/console.py +0 -33
  326. webscout/Litlogger/handlers/file.py +0 -143
  327. webscout/Litlogger/handlers/network.py +0 -173
  328. webscout/Litlogger/styles/__init__.py +0 -7
  329. webscout/Litlogger/styles/colors.py +0 -249
  330. webscout/Litlogger/styles/formats.py +0 -458
  331. webscout/Litlogger/styles/text.py +0 -87
  332. webscout/Litlogger/utils/__init__.py +0 -6
  333. webscout/Litlogger/utils/detectors.py +0 -153
  334. webscout/Litlogger/utils/formatters.py +0 -200
  335. webscout/Local/__init__.py +0 -12
  336. webscout/Local/__main__.py +0 -9
  337. webscout/Local/api.py +0 -576
  338. webscout/Local/cli.py +0 -516
  339. webscout/Local/config.py +0 -75
  340. webscout/Local/llm.py +0 -287
  341. webscout/Local/model_manager.py +0 -253
  342. webscout/Local/server.py +0 -721
  343. webscout/Local/utils.py +0 -93
  344. webscout/Provider/AI21.py +0 -177
  345. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  346. webscout/Provider/AISEARCH/ISou.py +0 -256
  347. webscout/Provider/AISEARCH/felo_search.py +0 -228
  348. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  349. webscout/Provider/AISEARCH/hika_search.py +0 -194
  350. webscout/Provider/AISEARCH/scira_search.py +0 -324
  351. webscout/Provider/Aitopia.py +0 -292
  352. webscout/Provider/AllenAI.py +0 -413
  353. webscout/Provider/Blackboxai.py +0 -229
  354. webscout/Provider/C4ai.py +0 -432
  355. webscout/Provider/ChatGPTClone.py +0 -226
  356. webscout/Provider/ChatGPTES.py +0 -237
  357. webscout/Provider/ChatGPTGratis.py +0 -194
  358. webscout/Provider/Chatify.py +0 -175
  359. webscout/Provider/Cloudflare.py +0 -273
  360. webscout/Provider/DeepSeek.py +0 -196
  361. webscout/Provider/ElectronHub.py +0 -709
  362. webscout/Provider/ExaChat.py +0 -342
  363. webscout/Provider/Free2GPT.py +0 -241
  364. webscout/Provider/GPTWeb.py +0 -193
  365. webscout/Provider/Glider.py +0 -211
  366. webscout/Provider/HF_space/__init__.py +0 -0
  367. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  368. webscout/Provider/HuggingFaceChat.py +0 -462
  369. webscout/Provider/Hunyuan.py +0 -272
  370. webscout/Provider/LambdaChat.py +0 -392
  371. webscout/Provider/Llama.py +0 -200
  372. webscout/Provider/Llama3.py +0 -204
  373. webscout/Provider/Marcus.py +0 -148
  374. webscout/Provider/OLLAMA.py +0 -396
  375. webscout/Provider/OPENAI/c4ai.py +0 -367
  376. webscout/Provider/OPENAI/chatgptclone.py +0 -460
  377. webscout/Provider/OPENAI/exachat.py +0 -433
  378. webscout/Provider/OPENAI/freeaichat.py +0 -352
  379. webscout/Provider/OPENAI/opkfc.py +0 -488
  380. webscout/Provider/OPENAI/scirachat.py +0 -463
  381. webscout/Provider/OPENAI/standardinput.py +0 -425
  382. webscout/Provider/OPENAI/typegpt.py +0 -346
  383. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  384. webscout/Provider/OPENAI/venice.py +0 -413
  385. webscout/Provider/OPENAI/yep.py +0 -327
  386. webscout/Provider/OpenGPT.py +0 -199
  387. webscout/Provider/Perplexitylabs.py +0 -415
  388. webscout/Provider/Phind.py +0 -535
  389. webscout/Provider/PizzaGPT.py +0 -198
  390. webscout/Provider/Reka.py +0 -214
  391. webscout/Provider/StandardInput.py +0 -278
  392. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  393. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  394. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  395. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  396. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  397. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  398. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  399. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  400. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  401. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  402. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  403. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  404. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  405. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  406. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  407. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  408. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  409. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  410. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  411. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  412. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  413. webscout/Provider/TTI/artbit/__init__.py +0 -22
  414. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  415. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  416. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  417. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  418. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  419. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  420. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  421. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  422. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  423. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  424. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  425. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  426. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  427. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  428. webscout/Provider/TTI/talkai/__init__.py +0 -4
  429. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  430. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  431. webscout/Provider/TTS/gesserit.py +0 -127
  432. webscout/Provider/TeachAnything.py +0 -187
  433. webscout/Provider/Venice.py +0 -219
  434. webscout/Provider/VercelAI.py +0 -234
  435. webscout/Provider/WebSim.py +0 -228
  436. webscout/Provider/Writecream.py +0 -211
  437. webscout/Provider/WritingMate.py +0 -197
  438. webscout/Provider/aimathgpt.py +0 -189
  439. webscout/Provider/askmyai.py +0 -158
  440. webscout/Provider/asksteve.py +0 -203
  441. webscout/Provider/bagoodex.py +0 -145
  442. webscout/Provider/chatglm.py +0 -205
  443. webscout/Provider/copilot.py +0 -428
  444. webscout/Provider/freeaichat.py +0 -271
  445. webscout/Provider/gaurish.py +0 -244
  446. webscout/Provider/geminiprorealtime.py +0 -160
  447. webscout/Provider/granite.py +0 -187
  448. webscout/Provider/hermes.py +0 -219
  449. webscout/Provider/koala.py +0 -268
  450. webscout/Provider/labyrinth.py +0 -340
  451. webscout/Provider/lepton.py +0 -194
  452. webscout/Provider/llamatutor.py +0 -192
  453. webscout/Provider/multichat.py +0 -325
  454. webscout/Provider/promptrefine.py +0 -193
  455. webscout/Provider/scira_chat.py +0 -277
  456. webscout/Provider/scnet.py +0 -187
  457. webscout/Provider/talkai.py +0 -194
  458. webscout/Provider/tutorai.py +0 -252
  459. webscout/Provider/typegpt.py +0 -232
  460. webscout/Provider/uncovr.py +0 -312
  461. webscout/Provider/yep.py +0 -376
  462. webscout/litprinter/__init__.py +0 -59
  463. webscout/scout/core.py +0 -881
  464. webscout/tempid.py +0 -128
  465. webscout/webscout_search.py +0 -1346
  466. webscout/webscout_search_async.py +0 -877
  467. webscout/yep_search.py +0 -297
  468. webscout-8.2.2.dist-info/METADATA +0 -734
  469. webscout-8.2.2.dist-info/RECORD +0 -309
  470. webscout-8.2.2.dist-info/entry_points.txt +0 -5
  471. webscout-8.2.2.dist-info/top_level.txt +0 -3
  472. webstoken/__init__.py +0 -30
  473. webstoken/classifier.py +0 -189
  474. webstoken/keywords.py +0 -216
  475. webstoken/language.py +0 -128
  476. webstoken/ner.py +0 -164
  477. webstoken/normalizer.py +0 -35
  478. webstoken/processor.py +0 -77
  479. webstoken/sentiment.py +0 -206
  480. webstoken/stemmer.py +0 -73
  481. webstoken/tagger.py +0 -60
  482. webstoken/tokenizer.py +0 -158
  483. {webscout-8.2.2.dist-info → webscout-2026.1.19.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,568 +1,800 @@
1
- """
2
- Scout Main Module - HTML Parsing and Traversal
3
- """
4
- import re
5
- import json
6
- import hashlib
7
- import unicodedata
8
- import urllib.parse
9
- from typing import List, Dict, Optional, Any
10
-
11
- from ..parsers import ParserRegistry
12
- from ..element import Tag, NavigableString
13
- from ..utils import decode_markup
14
- from .text_analyzer import ScoutTextAnalyzer
15
- from .web_analyzer import ScoutWebAnalyzer
16
- from .search_result import ScoutSearchResult
17
- from .text_utils import SentenceTokenizer
18
-
19
-
20
- class Scout:
21
- """
22
- Scout - Making web scraping a breeze! 🌊
23
- A comprehensive HTML parsing and traversal library.
24
- Enhanced with advanced features and intelligent parsing.
25
- """
26
-
27
- def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
28
- """
29
- Initialize Scout with HTML content.
30
-
31
- Args:
32
- markup (str): HTML content to parse
33
- features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
34
- from_encoding (str): Source encoding (if known)
35
- **kwargs: Additional parsing options
36
- """
37
- # Intelligent markup handling
38
- self.markup = self._preprocess_markup(markup, from_encoding)
39
- self.features = features
40
- self.from_encoding = from_encoding
41
-
42
- # Get the right parser for the job
43
- if features not in ParserRegistry.list_parsers():
44
- raise ValueError(
45
- f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
46
- )
47
-
48
- parser_class = ParserRegistry.get_parser(features)
49
- self.parser = parser_class
50
-
51
- # Parse that HTML! 🎯
52
- self._soup = self.parser.parse(self.markup)
53
-
54
- # BeautifulSoup-like attributes
55
- self.name = self._soup.name if hasattr(self._soup, 'name') else None
56
- self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
57
-
58
- # Advanced parsing options
59
- self._cache = {}
60
-
61
- # Text and web analyzers
62
- self.text_analyzer = ScoutTextAnalyzer()
63
- self.web_analyzer = ScoutWebAnalyzer()
64
-
65
- def normalize_text(self, text: str, form='NFKD') -> str:
66
- """
67
- Normalize text using Unicode normalization.
68
-
69
- Args:
70
- text (str): Input text
71
- form (str, optional): Normalization form
72
-
73
- Returns:
74
- str: Normalized text
75
- """
76
- return unicodedata.normalize(form, text)
77
-
78
- def url_parse(self, url: str) -> Dict[str, str]:
79
- """
80
- Parse and analyze a URL.
81
-
82
- Args:
83
- url (str): URL to parse
84
-
85
- Returns:
86
- Dict[str, str]: Parsed URL components
87
- """
88
- parsed = urllib.parse.urlparse(url)
89
- return {
90
- 'scheme': parsed.scheme,
91
- 'netloc': parsed.netloc,
92
- 'path': parsed.path,
93
- 'params': parsed.params,
94
- 'query': parsed.query,
95
- 'fragment': parsed.fragment
96
- }
97
-
98
- def analyze_page_structure(self) -> Dict[str, Any]:
99
- """
100
- Analyze the structure of the parsed page.
101
-
102
- Returns:
103
- Dict[str, Any]: Page structure analysis
104
- """
105
- return self.web_analyzer.analyze_page_structure(self)
106
-
107
- def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
108
- """
109
- Perform advanced text analysis.
110
-
111
- Args:
112
- text (str, optional): Text to analyze. If None, uses page text.
113
-
114
- Returns:
115
- Dict[str, Any]: Text analysis results
116
- """
117
- if text is None:
118
- text = self.get_text()
119
-
120
- return {
121
- 'word_count': self.text_analyzer.count_words(text),
122
- 'entities': self.text_analyzer.extract_entities(text),
123
- 'tokens': self.text_analyzer.tokenize(text)
124
- }
125
-
126
- def extract_semantic_info(self) -> Dict[str, Any]:
127
- """
128
- Extract semantic information from the document.
129
-
130
- Returns:
131
- Dict[str, Any]: Semantic information
132
- """
133
- semantic_info = {
134
- 'headings': {
135
- 'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
136
- 'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
137
- 'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
138
- },
139
- 'lists': {
140
- 'ul': [ul.find_all('li') for ul in self.find_all('ul')],
141
- 'ol': [ol.find_all('li') for ol in self.find_all('ol')]
142
- },
143
- 'tables': {
144
- 'count': len(self.find_all('table')),
145
- 'headers': [table.find_all('th') for table in self.find_all('table')]
146
- }
147
- }
148
- return semantic_info
149
-
150
- def cache(self, key: str, value: Any = None) -> Any:
151
- """
152
- Manage a cache for parsed content.
153
-
154
- Args:
155
- key (str): Cache key
156
- value (Any, optional): Value to cache
157
-
158
- Returns:
159
- Any: Cached value or None
160
- """
161
- if value is not None:
162
- self._cache[key] = value
163
- return self._cache.get(key)
164
-
165
- def hash_content(self, method='md5') -> str:
166
- """
167
- Generate a hash of the parsed content.
168
-
169
- Args:
170
- method (str, optional): Hashing method
171
-
172
- Returns:
173
- str: Content hash
174
- """
175
- hash_methods = {
176
- 'md5': hashlib.md5,
177
- 'sha1': hashlib.sha1,
178
- 'sha256': hashlib.sha256
179
- }
180
-
181
- if method not in hash_methods:
182
- raise ValueError(f"Unsupported hash method: {method}")
183
-
184
- hasher = hash_methods[method]()
185
- hasher.update(str(self._soup).encode('utf-8'))
186
- return hasher.hexdigest()
187
-
188
- def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
189
- """
190
- Extract all links from the document.
191
-
192
- Args:
193
- base_url (str, optional): Base URL for resolving relative links
194
-
195
- Returns:
196
- List[Dict[str, str]]: List of link dictionaries
197
- """
198
- links = []
199
- for link in self.find_all(['a', 'link']):
200
- href = link.get('href')
201
- if href:
202
- # Resolve relative URLs if base_url is provided
203
- if base_url and not href.startswith(('http://', 'https://', '//')):
204
- href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
205
-
206
- links.append({
207
- 'href': href,
208
- 'text': link.get_text(strip=True),
209
- 'rel': link.get('rel', [None])[0],
210
- 'type': link.get('type')
211
- })
212
- return links
213
-
214
- def extract_metadata(self) -> Dict[str, Any]:
215
- """
216
- Extract metadata from HTML document.
217
-
218
- Returns:
219
- Dict[str, Any]: Extracted metadata
220
- """
221
- metadata = {
222
- 'title': self.find('title').texts()[0] if self.find('title').texts() else None,
223
- 'description': self.find('meta', attrs={'name': 'description'}).attrs('content')[0] if self.find('meta', attrs={'name': 'description'}).attrs('content') else None,
224
- 'keywords': self.find('meta', attrs={'name': 'keywords'}).attrs('content')[0].split(',') if self.find('meta', attrs={'name': 'keywords'}).attrs('content') else [],
225
- 'og_metadata': {},
226
- 'twitter_metadata': {}
227
- }
228
-
229
- # Open Graph metadata
230
- for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
231
- key = meta.attrs('property')[0][3:]
232
- metadata['og_metadata'][key] = meta.attrs('content')[0]
233
-
234
- # Twitter Card metadata
235
- for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
236
- key = meta.attrs('name')[0][8:]
237
- metadata['twitter_metadata'][key] = meta.attrs('content')[0]
238
-
239
- return metadata
240
-
241
- def to_json(self, indent=2) -> str:
242
- """
243
- Convert parsed content to JSON.
244
-
245
- Args:
246
- indent (int, optional): JSON indentation
247
-
248
- Returns:
249
- str: JSON representation of the document
250
- """
251
- def _tag_to_dict(tag):
252
- if isinstance(tag, NavigableString):
253
- return str(tag)
254
-
255
- result = {
256
- 'name': tag.name,
257
- 'attrs': tag.attrs,
258
- 'text': tag.get_text(strip=True)
259
- }
260
-
261
- if tag.contents:
262
- result['children'] = [_tag_to_dict(child) for child in tag.contents]
263
-
264
- return result
265
-
266
- return json.dumps(_tag_to_dict(self._soup), indent=indent)
267
-
268
- def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
269
- """
270
- Find the first matching element.
271
-
272
- Args:
273
- name (str, optional): Tag name to search for
274
- attrs (dict, optional): Attributes to match
275
- recursive (bool, optional): Search recursively
276
- text (str, optional): Text content to match
277
-
278
- Returns:
279
- ScoutSearchResult: First matching element
280
- """
281
- result = self._soup.find(name, attrs, recursive, text, **kwargs)
282
- return ScoutSearchResult([result]) if result else ScoutSearchResult([])
283
-
284
- def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
285
- """
286
- Find all matching elements.
287
-
288
- Args:
289
- name (str, optional): Tag name to search for
290
- attrs (dict, optional): Attributes to match
291
- recursive (bool, optional): Search recursively
292
- text (str, optional): Text content to match
293
- limit (int, optional): Maximum number of results
294
-
295
- Returns:
296
- ScoutSearchResult: List of matching elements
297
- """
298
- results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
299
- return ScoutSearchResult(results)
300
-
301
- def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
302
- """
303
- Find the first parent matching given criteria.
304
-
305
- Args:
306
- name (str, optional): Tag name to search for
307
- attrs (dict, optional): Attributes to match
308
-
309
- Returns:
310
- Tag or None: First matching parent
311
- """
312
- current = self._soup.parent
313
- while current:
314
- if (name is None or current.name == name) and \
315
- all(current.get(k) == v for k, v in attrs.items()):
316
- return current
317
- current = current.parent
318
- return None
319
-
320
- def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
321
- """
322
- Find all parents matching given criteria.
323
-
324
- Args:
325
- name (str, optional): Tag name to search for
326
- attrs (dict, optional): Attributes to match
327
- limit (int, optional): Maximum number of results
328
-
329
- Returns:
330
- List[Tag]: List of matching parents
331
- """
332
- parents = []
333
- current = self._soup.parent
334
- while current and (limit is None or len(parents) < limit):
335
- if (name is None or current.name == name) and \
336
- all(current.get(k) == v for k, v in attrs.items()):
337
- parents.append(current)
338
- current = current.parent
339
- return parents
340
-
341
- def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
342
- """
343
- Find the next sibling matching given criteria.
344
-
345
- Args:
346
- name (str, optional): Tag name to search for
347
- attrs (dict, optional): Attributes to match
348
-
349
- Returns:
350
- Tag or None: First matching next sibling
351
- """
352
- if not self._soup.parent:
353
- return None
354
-
355
- siblings = self._soup.parent.contents
356
- try:
357
- current_index = siblings.index(self._soup)
358
- for sibling in siblings[current_index + 1:]:
359
- if isinstance(sibling, Tag):
360
- if (name is None or sibling.name == name) and \
361
- all(sibling.get(k) == v for k, v in attrs.items()):
362
- return sibling
363
- except ValueError:
364
- pass
365
- return None
366
-
367
- def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
368
- """
369
- Find all next siblings matching given criteria.
370
-
371
- Args:
372
- name (str, optional): Tag name to search for
373
- attrs (dict, optional): Attributes to match
374
- limit (int, optional): Maximum number of results
375
-
376
- Returns:
377
- List[Tag]: List of matching next siblings
378
- """
379
- if not self._soup.parent:
380
- return []
381
-
382
- siblings = []
383
- siblings_list = self._soup.parent.contents
384
- try:
385
- current_index = siblings_list.index(self._soup)
386
- for sibling in siblings_list[current_index + 1:]:
387
- if isinstance(sibling, Tag):
388
- if (name is None or sibling.name == name) and \
389
- all(sibling.get(k) == v for k, v in attrs.items()):
390
- siblings.append(sibling)
391
- if limit and len(siblings) == limit:
392
- break
393
- except ValueError:
394
- pass
395
- return siblings
396
-
397
- def select(self, selector: str) -> List[Tag]:
398
- """
399
- Select elements using CSS selector.
400
-
401
- Args:
402
- selector (str): CSS selector string
403
-
404
- Returns:
405
- List[Tag]: List of matching elements
406
- """
407
- return self._soup.select(selector)
408
-
409
- def select_one(self, selector: str) -> Optional[Tag]:
410
- """
411
- Select the first element matching the CSS selector.
412
-
413
- Args:
414
- selector (str): CSS selector string
415
-
416
- Returns:
417
- Tag or None: First matching element
418
- """
419
- return self._soup.select_one(selector)
420
-
421
- def get_text(self, separator=' ', strip=False, types=None) -> str:
422
- """
423
- Extract all text from the parsed document.
424
-
425
- Args:
426
- separator (str, optional): Text separator
427
- strip (bool, optional): Strip whitespace
428
- types (list, optional): Types of content to extract
429
-
430
- Returns:
431
- str: Extracted text
432
- """
433
- tokenizer = SentenceTokenizer()
434
- text = self._soup.get_text(separator, strip, types)
435
- sentences = tokenizer.tokenize(text)
436
- return "\n\n".join(sentences)
437
-
438
- def remove_tags(self, tags: List[str]) -> None:
439
- """
440
- Remove specified tags and their contents from the document.
441
-
442
- Args:
443
- tags (List[str]): List of tag names to remove
444
- """
445
- for tag_name in tags:
446
- for tag in self._soup.find_all(tag_name):
447
- tag.decompose()
448
-
449
- def prettify(self, formatter='minimal') -> str:
450
- """
451
- Return a formatted, pretty-printed version of the HTML.
452
-
453
- Args:
454
- formatter (str, optional): Formatting style
455
-
456
- Returns:
457
- str: Prettified HTML
458
- """
459
- return self._soup.prettify(formatter)
460
-
461
- def decompose(self, tag: Tag = None) -> None:
462
- """
463
- Remove a tag and its contents from the document.
464
-
465
- Args:
466
- tag (Tag, optional): Tag to remove. If None, removes the root tag.
467
- """
468
- if tag is None:
469
- tag = self._soup
470
- tag.decompose()
471
-
472
- def extract(self, tag: Tag = None) -> Tag:
473
- """
474
- Remove a tag from the document and return it.
475
-
476
- Args:
477
- tag (Tag, optional): Tag to extract. If None, extracts the root tag.
478
-
479
- Returns:
480
- Tag: Extracted tag
481
- """
482
- if tag is None:
483
- tag = self._soup
484
- return tag.extract()
485
-
486
- def clear(self, tag: Tag = None) -> None:
487
- """
488
- Remove a tag's contents while keeping the tag itself.
489
-
490
- Args:
491
- tag (Tag, optional): Tag to clear. If None, clears the root tag.
492
- """
493
- if tag is None:
494
- tag = self._soup
495
- tag.clear()
496
-
497
- def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
498
- """
499
- Replace one tag with another.
500
-
501
- Args:
502
- old_tag (Tag): Tag to replace
503
- new_tag (Tag): Replacement tag
504
- """
505
- old_tag.replace_with(new_tag)
506
-
507
- def encode(self, encoding='utf-8') -> bytes:
508
- """
509
- Encode the document to a specific encoding.
510
-
511
- Args:
512
- encoding (str, optional): Encoding to use
513
-
514
- Returns:
515
- bytes: Encoded document
516
- """
517
- return str(self._soup).encode(encoding)
518
-
519
- def decode(self, encoding='utf-8') -> str:
520
- """
521
- Decode the document from a specific encoding.
522
-
523
- Args:
524
- encoding (str, optional): Encoding to use
525
-
526
- Returns:
527
- str: Decoded document
528
- """
529
- return str(self._soup)
530
-
531
- def __str__(self) -> str:
532
- """
533
- String representation of the parsed document.
534
-
535
- Returns:
536
- str: HTML content
537
- """
538
- return str(self._soup)
539
-
540
- def __repr__(self) -> str:
541
- """
542
- Detailed representation of the Scout object.
543
-
544
- Returns:
545
- str: Scout object description
546
- """
547
- return f"Scout(features='{self.features}', content_length={len(self.markup)})"
548
-
549
- def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
550
- """
551
- Preprocess markup before parsing.
552
-
553
- Args:
554
- markup (str): Input markup
555
- encoding (str, optional): Encoding to use
556
-
557
- Returns:
558
- str: Preprocessed markup
559
- """
560
- # Decode markup
561
- decoded_markup = decode_markup(markup, encoding)
562
-
563
- # Basic HTML cleaning
564
- # Remove comments, normalize whitespace, etc.
565
- decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
566
- decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
567
-
568
- return decoded_markup
1
+ """
2
+ Scout Main Module - HTML Parsing and Traversal
3
+ """
4
+
5
+ import hashlib
6
+ import json
7
+ import re
8
+ import unicodedata
9
+ import urllib.parse
10
+ from typing import Any, Dict, List, Literal, Optional, Union
11
+
12
+ from ..element import NavigableString, Tag
13
+ from ..parsers import ParserRegistry
14
+ from ..utils import decode_markup
15
+ from .search_result import ScoutSearchResult
16
+ from .text_analyzer import ScoutTextAnalyzer
17
+ from .web_analyzer import ScoutWebAnalyzer
18
+
19
+
20
+ class Scout:
21
+ """
22
+ Scout - Making web scraping a breeze! 🌊
23
+ A comprehensive HTML parsing and traversal library.
24
+ Enhanced with advanced features and intelligent parsing.
25
+ """
26
+
27
+ def __init__(
28
+ self,
29
+ markup: Union[str, bytes] = "",
30
+ features: str = "html.parser",
31
+ from_encoding: Optional[str] = None,
32
+ exclude_encodings: Optional[List[str]] = None,
33
+ element_classes: Optional[Dict[str, Any]] = None,
34
+ **kwargs,
35
+ ):
36
+ """
37
+ Initialize Scout with HTML content.
38
+
39
+ Args:
40
+ markup (str): HTML content to parse
41
+ features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
42
+ from_encoding (str): Source encoding (if known)
43
+ exclude_encodings (list): Encodings to avoid
44
+ element_classes (dict): Custom classes for different element types
45
+ **kwargs: Additional parsing options
46
+ """
47
+ # Store original markup and settings
48
+ self.original_encoding = from_encoding
49
+ self.exclude_encodings = exclude_encodings or []
50
+ self.element_classes = element_classes or {}
51
+ self.builder_features = features
52
+ self.contains_replacement_characters = False
53
+
54
+ # Intelligent markup handling
55
+ self.markup = self._preprocess_markup(markup, from_encoding)
56
+ self.features = features
57
+ self.from_encoding = from_encoding
58
+
59
+ # Get the right parser for the job
60
+ if features not in ParserRegistry.list_parsers():
61
+ raise ValueError(
62
+ f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
63
+ )
64
+
65
+ parser_class = ParserRegistry.get_parser(features)
66
+ self.parser = parser_class
67
+
68
+ # Parse that HTML! 🎯
69
+ self._soup = self.parser.parse(self.markup)
70
+
71
+ # Set up the root element properly
72
+ if hasattr(self._soup, "name"):
73
+ self.name = self._soup.name
74
+ else:
75
+ self.name = "[document]"
76
+
77
+ # BS4-like attributes
78
+ self.attrs = self._soup.attrs if hasattr(self._soup, "attrs") else {}
79
+ self.contents = self._soup.contents if hasattr(self._soup, "contents") else []
80
+ self.parent = None
81
+ self.next_sibling = None
82
+ self.previous_sibling = None
83
+
84
+ # Advanced parsing options and caching
85
+ self._cache = {}
86
+ self._tag_name_cache = {}
87
+ self._css_selector_cache = {}
88
+
89
+ # Text and web analyzers
90
+ self.text_analyzer = ScoutTextAnalyzer()
91
+ self.web_analyzer = ScoutWebAnalyzer()
92
+
93
+ def normalize_text(self, text: str, form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFKD") -> str:
94
+ """
95
+ Normalize text using Unicode normalization.
96
+
97
+ Args:
98
+ text (str): Input text
99
+ form (Literal["NFC", "NFD", "NFKC", "NFKD"], optional): Normalization form
100
+
101
+ Returns:
102
+ str: Normalized text
103
+ """
104
+ return unicodedata.normalize(form, text)
105
+
106
+ def url_parse(self, url: str) -> Dict[str, str]:
107
+ """
108
+ Parse and analyze a URL.
109
+
110
+ Args:
111
+ url (str): URL to parse
112
+
113
+ Returns:
114
+ Dict[str, str]: Parsed URL components
115
+ """
116
+ parsed = urllib.parse.urlparse(url)
117
+ return {
118
+ "scheme": parsed.scheme,
119
+ "netloc": parsed.netloc,
120
+ "path": parsed.path,
121
+ "params": parsed.params,
122
+ "query": parsed.query,
123
+ "fragment": parsed.fragment,
124
+ }
125
+
126
+ def analyze_page_structure(self) -> Dict[str, Any]:
127
+ """
128
+ Analyze the structure of the parsed page.
129
+
130
+ Returns:
131
+ Dict[str, Any]: Page structure analysis
132
+ """
133
+ return self.web_analyzer.analyze_page_structure(self)
134
+
135
+ def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
136
+ """
137
+ Perform advanced text analysis.
138
+
139
+ Args:
140
+ text (str, optional): Text to analyze. If None, uses page text.
141
+
142
+ Returns:
143
+ Dict[str, Any]: Text analysis results
144
+ """
145
+ if text is None:
146
+ text = self.get_text()
147
+
148
+ return {
149
+ "word_count": self.text_analyzer.count_words(text),
150
+ "entities": self.text_analyzer.extract_entities(text),
151
+ "tokens": self.text_analyzer.tokenize(text),
152
+ }
153
+
154
+ def extract_semantic_info(self) -> Dict[str, Any]:
155
+ """
156
+ Extract semantic information from the document.
157
+
158
+ Returns:
159
+ Dict[str, Any]: Semantic information
160
+ """
161
+ semantic_info = {
162
+ "headings": {
163
+ "h1": [h.get_text(strip=True) for h in self.find_all("h1")],
164
+ "h2": [h.get_text(strip=True) for h in self.find_all("h2")],
165
+ "h3": [h.get_text(strip=True) for h in self.find_all("h3")],
166
+ },
167
+ "lists": {
168
+ "ul": [ul.find_all("li") for ul in self.find_all("ul")],
169
+ "ol": [ol.find_all("li") for ol in self.find_all("ol")],
170
+ },
171
+ "tables": {
172
+ "count": len(self.find_all("table")),
173
+ "headers": [table.find_all("th") for table in self.find_all("table")],
174
+ },
175
+ }
176
+ return semantic_info
177
+
178
+ def cache(self, key: str, value: Any = None) -> Any:
179
+ """
180
+ Manage a cache for parsed content.
181
+
182
+ Args:
183
+ key (str): Cache key
184
+ value (Any, optional): Value to cache
185
+
186
+ Returns:
187
+ Any: Cached value or None
188
+ """
189
+ if value is not None:
190
+ self._cache[key] = value
191
+ return self._cache.get(key)
192
+
193
+ def hash_content(self, method="md5") -> str:
194
+ """
195
+ Generate a hash of the parsed content.
196
+
197
+ Args:
198
+ method (str, optional): Hashing method
199
+
200
+ Returns:
201
+ str: Content hash
202
+ """
203
+ hash_methods = {"md5": hashlib.md5, "sha1": hashlib.sha1, "sha256": hashlib.sha256}
204
+
205
+ if method not in hash_methods:
206
+ raise ValueError(f"Unsupported hash method: {method}")
207
+
208
+ hasher = hash_methods[method]()
209
+ hasher.update(str(self._soup).encode("utf-8"))
210
+ return hasher.hexdigest()
211
+
212
+ def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
213
+ """
214
+ Extract all links from the document.
215
+
216
+ Args:
217
+ base_url (str, optional): Base URL for resolving relative links
218
+
219
+ Returns:
220
+ List[Dict[str, str]]: List of link dictionaries
221
+ """
222
+ links = []
223
+ for link in self.find_all(["a", "link"]):
224
+ href = link.get("href")
225
+ if href:
226
+ # Resolve relative URLs if base_url is provided
227
+ if base_url and not href.startswith(("http://", "https://", "//")):
228
+ href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
229
+
230
+ links.append(
231
+ {
232
+ "href": href,
233
+ "text": link.get_text(strip=True),
234
+ "rel": link.get("rel", [None])[0],
235
+ "type": link.get("type"),
236
+ }
237
+ )
238
+ return links
239
+
240
+ def extract_metadata(self) -> Dict[str, Any]:
241
+ """
242
+ Extract metadata from HTML document.
243
+
244
+ Returns:
245
+ Dict[str, Any]: Extracted metadata
246
+ """
247
+ title_tag = self.find("title")
248
+ desc_tag = self.find("meta", attrs={"name": "description"})
249
+ keywords_tag = self.find("meta", attrs={"name": "keywords"})
250
+
251
+ metadata = {
252
+ "title": title_tag.get_text(strip=True) if title_tag else None,
253
+ "description": desc_tag.get("content") if desc_tag else None,
254
+ "keywords": keywords_tag.get("content").split(",")
255
+ if keywords_tag and keywords_tag.get("content")
256
+ else [],
257
+ "og_metadata": {},
258
+ "twitter_metadata": {},
259
+ }
260
+
261
+ # Open Graph metadata
262
+ for meta in self.find_all("meta", attrs={"property": re.compile(r"^og:")}):
263
+ key = meta.get("property")
264
+ if key and key.startswith("og:"):
265
+ if isinstance(metadata["og_metadata"], dict):
266
+ metadata["og_metadata"][key[3:]] = meta.get("content")
267
+
268
+ # Twitter Card metadata
269
+ for meta in self.find_all("meta", attrs={"name": re.compile(r"^twitter:")}):
270
+ key = meta.get("name")
271
+ if key and key.startswith("twitter:"):
272
+ if isinstance(metadata["twitter_metadata"], dict):
273
+ metadata["twitter_metadata"][key[8:]] = meta.get("content")
274
+
275
+ return metadata
276
+
277
+ def to_json(self, indent=2) -> str:
278
+ """
279
+ Convert parsed content to JSON.
280
+
281
+ Args:
282
+ indent (int, optional): JSON indentation
283
+
284
+ Returns:
285
+ str: JSON representation of the document
286
+ """
287
+
288
+ def _tag_to_dict(tag):
289
+ if isinstance(tag, NavigableString):
290
+ return str(tag)
291
+
292
+ result = {"name": tag.name, "attrs": tag.attrs, "text": tag.get_text(strip=True)}
293
+
294
+ if tag.contents:
295
+ result["children"] = [_tag_to_dict(child) for child in tag.contents]
296
+
297
+ return result
298
+
299
+ return json.dumps(_tag_to_dict(self._soup), indent=indent)
300
+
301
+ def find(
302
+ self, name=None, attrs={}, recursive=True, text=None, class_=None, **kwargs
303
+ ) -> Optional[Tag]:
304
+ """
305
+ Find the first matching element. Returns a single Tag or None.
306
+ Highly compatible with BS4.
307
+ """
308
+ return self._soup.find(name, attrs, recursive, text, limit=1, class_=class_, **kwargs)
309
+
310
+ def find_all(
311
+ self, name=None, attrs={}, recursive=True, text=None, limit=None, class_=None, **kwargs
312
+ ) -> ScoutSearchResult:
313
+ """
314
+ Find all matching elements.
315
+
316
+ Args:
317
+ name (str, optional): Tag name to search for
318
+ attrs (dict, optional): Attributes to match
319
+ recursive (bool, optional): Search recursively
320
+ text (str, optional): Text content to match
321
+ limit (int, optional): Maximum number of results
322
+
323
+ Returns:
324
+ ScoutSearchResult: List of matching elements
325
+ """
326
+ results = self._soup.find_all(name, attrs, recursive, text, limit, class_=class_, **kwargs)
327
+ return ScoutSearchResult(results)
328
+
329
+ def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
330
+ """
331
+ Find the first parent matching given criteria.
332
+
333
+ Args:
334
+ name (str, optional): Tag name to search for
335
+ attrs (dict, optional): Attributes to match
336
+
337
+ Returns:
338
+ Tag or None: First matching parent
339
+ """
340
+ current = self._soup.parent
341
+ while current:
342
+ if (name is None or current.name == name) and all(
343
+ current.get(k) == v for k, v in attrs.items()
344
+ ):
345
+ return current
346
+ current = current.parent
347
+ return None
348
+
349
+ def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
350
+ """
351
+ Find all parents matching given criteria.
352
+
353
+ Args:
354
+ name (str, optional): Tag name to search for
355
+ attrs (dict, optional): Attributes to match
356
+ limit (int, optional): Maximum number of results
357
+
358
+ Returns:
359
+ List[Tag]: List of matching parents
360
+ """
361
+ parents = []
362
+ current = self._soup.parent
363
+ while current and (limit is None or len(parents) < limit):
364
+ if (name is None or current.name == name) and all(
365
+ current.get(k) == v for k, v in attrs.items()
366
+ ):
367
+ parents.append(current)
368
+ current = current.parent
369
+ return parents
370
+
371
+ def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
372
+ """
373
+ Find the next sibling matching given criteria.
374
+
375
+ Args:
376
+ name (str, optional): Tag name to search for
377
+ attrs (dict, optional): Attributes to match
378
+
379
+ Returns:
380
+ Tag or None: First matching next sibling
381
+ """
382
+ if not self._soup.parent:
383
+ return None
384
+
385
+ siblings = self._soup.parent.contents
386
+ try:
387
+ current_index = siblings.index(self._soup)
388
+ for sibling in siblings[current_index + 1 :]:
389
+ if isinstance(sibling, Tag):
390
+ if (name is None or sibling.name == name) and all(
391
+ sibling.get(k) == v for k, v in attrs.items()
392
+ ):
393
+ return sibling
394
+ except ValueError:
395
+ pass
396
+ return None
397
+
398
+ def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
399
+ """
400
+ Find all next siblings matching given criteria.
401
+
402
+ Args:
403
+ name (str, optional): Tag name to search for
404
+ attrs (dict, optional): Attributes to match
405
+ limit (int, optional): Maximum number of results
406
+
407
+ Returns:
408
+ List[Tag]: List of matching next siblings
409
+ """
410
+ if not self._soup.parent:
411
+ return []
412
+
413
+ siblings = []
414
+ siblings_list = self._soup.parent.contents
415
+ try:
416
+ current_index = siblings_list.index(self._soup)
417
+ for sibling in siblings_list[current_index + 1 :]:
418
+ if isinstance(sibling, Tag):
419
+ if (name is None or sibling.name == name) and all(
420
+ sibling.get(k) == v for k, v in attrs.items()
421
+ ):
422
+ siblings.append(sibling)
423
+ if limit and len(siblings) == limit:
424
+ break
425
+ except ValueError:
426
+ pass
427
+ return siblings
428
+
429
+ def find_previous_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
430
+ """Find the previous sibling matching given criteria."""
431
+ if not self._soup.parent:
432
+ return None
433
+
434
+ siblings = self._soup.parent.contents
435
+ try:
436
+ current_index = siblings.index(self._soup)
437
+ for sibling in reversed(siblings[:current_index]):
438
+ if isinstance(sibling, Tag):
439
+ if (name is None or sibling.name == name) and all(
440
+ sibling.get(k) == v for k, v in attrs.items()
441
+ ):
442
+ return sibling
443
+ except ValueError:
444
+ pass
445
+ return None
446
+
447
+ def find_previous_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
448
+ """Find all previous siblings matching given criteria."""
449
+ if not self._soup.parent:
450
+ return []
451
+
452
+ siblings = []
453
+ siblings_list = self._soup.parent.contents
454
+ try:
455
+ current_index = siblings_list.index(self._soup)
456
+ for sibling in reversed(siblings_list[:current_index]):
457
+ if isinstance(sibling, Tag):
458
+ if (name is None or sibling.name == name) and all(
459
+ sibling.get(k) == v for k, v in attrs.items()
460
+ ):
461
+ siblings.append(sibling)
462
+ if limit and len(siblings) == limit:
463
+ break
464
+ except ValueError:
465
+ pass
466
+ return siblings
467
+
468
+    def find_next(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+        """
+        Find the next element in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            **kwargs: Additional attributes
+
+        Returns:
+            Optional[Tag]: Next matching element or None
+        """
+        return self._soup.find_next(name, attrs, text, **kwargs)
+
+    def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all next elements in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            limit: Maximum number of results
+            **kwargs: Additional attributes
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.find_all_next(name, attrs, text, limit, **kwargs)
+
+    def find_previous(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+        """
+        Find the previous element in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            **kwargs: Additional attributes
+
+        Returns:
+            Optional[Tag]: Previous matching element or None
+        """
+        return self._soup.find_previous(name, attrs, text, **kwargs)
+
+    def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+        """
+        Find all previous elements in document order.
+
+        Args:
+            name: Tag name to search for
+            attrs: Attributes to match
+            text: Text content to match
+            limit: Maximum number of results
+            **kwargs: Additional attributes
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.find_all_previous(name, attrs, text, limit, **kwargs)
+
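+    # Unlike the sibling helpers, the four methods above delegate to the
+    # underlying soup's document-order traversal. Usage sketch (assumes `doc`
+    # is a Scout instance over parsed HTML):
+    #
+    #     h2 = doc.find_next("h2")                 # next <h2> in document order
+    #     paras = doc.find_all_next("p", limit=3)  # up to three following <p> tags
+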
+    def select(self, selector: str) -> List[Tag]:
+        """
+        Select elements using a CSS selector.
+
+        Args:
+            selector (str): CSS selector string
+
+        Returns:
+            List[Tag]: List of matching elements
+        """
+        return self._soup.select(selector)
+
+    def select_one(self, selector: str) -> Optional[Tag]:
+        """
+        Select the first element matching the CSS selector.
+
+        Args:
+            selector (str): CSS selector string
+
+        Returns:
+            Optional[Tag]: First matching element, or None
+        """
+        return self._soup.select_one(selector)
+
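+    # CSS selection sketch (illustrative; selector support is whatever the
+    # underlying soup implements):
+    #
+    #     links = doc.select("div.article a[href]")  # every matching anchor
+    #     title = doc.select_one("head > title")     # first match, or None
+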
+    def get_text(self, separator="", strip=False, types=None) -> str:
+        """
+        Extract all text from the parsed document.
+
+        Mirrors standard BS4 get_text behavior.
+        """
+        return self._soup.get_text(separator, strip, types)
+
+    @property
+    def text(self) -> str:
+        """BS4-compatible text property."""
+        return self.get_text()
+
+    @property
+    def string(self) -> Optional[str]:
+        """BS4-compatible string property."""
+        return self._soup.string
+
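+    # Text extraction sketch: `text` is shorthand for `get_text()` with the
+    # defaults, while `get_text` exposes separator/strip control (assumes the
+    # constructor accepts raw HTML, as used elsewhere in this module):
+    #
+    #     doc = Scout("<p>Hello <b>world</b></p>")
+    #     doc.text                                   # "Hello world"
+    #     doc.get_text(separator=" | ", strip=True)
+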
+    def get_text_robust(
+        self, separator=" ", strip=False, types=None, encoding_fallbacks=None
+    ) -> str:
+        """Extract text robustly, trying fallback encodings if needed."""
+        try:
+            return self.get_text(separator, strip, types)
+        except UnicodeDecodeError:
+            if encoding_fallbacks:
+                for enc in encoding_fallbacks:
+                    try:
+                        # Round-trip through the fallback encoding, replacing
+                        # any characters it cannot represent rather than
+                        # raising a second time.
+                        text = self._soup.get_text(separator, strip, types)
+                        return text.encode(enc, errors="replace").decode(enc, errors="replace")
+                    except Exception:
+                        continue
+            raise
+
+    def remove_tags(self, tags: List[str]) -> None:
+        """
+        Remove specified tags and their contents from the document.
+
+        Args:
+            tags (List[str]): List of tag names to remove
+        """
+        for tag_name in tags:
+            for tag in self._soup.find_all(tag_name):
+                tag.decompose()
+
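+    # A common use of remove_tags is dropping non-content elements before
+    # text extraction; decompose() deletes each match and its subtree in
+    # place:
+    #
+    #     doc.remove_tags(["script", "style", "noscript"])
+    #     clean = doc.get_text(separator=" ", strip=True)
+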
+    def prettify(self, formatter="minimal") -> str:
+        """
+        Return a formatted, pretty-printed version of the HTML.
+
+        Args:
+            formatter (str, optional): Formatting style
+
+        Returns:
+            str: Prettified HTML
+        """
+        return self._soup.prettify(formatter)
+
+    def decompose(self, tag: Optional[Tag] = None) -> None:
+        """
+        Remove a tag and its contents from the document.
+
+        Args:
+            tag (Tag, optional): Tag to remove. If None, removes the root tag.
+        """
+        if tag is None:
+            tag = self._soup
+        assert tag is not None
+        tag.decompose()
+
+    def extract(self, tag: Optional[Tag] = None) -> Tag:
+        """
+        Remove a tag from the document and return it.
+
+        Args:
+            tag (Tag, optional): Tag to extract. If None, extracts the root tag.
+
+        Returns:
+            Tag: Extracted tag
+        """
+        if tag is None:
+            tag = self._soup
+        assert tag is not None
+        return tag.extract()
+
+    def clear(self, tag: Optional[Tag] = None) -> None:
+        """
+        Remove a tag's contents while keeping the tag itself.
+
+        Args:
+            tag (Tag, optional): Tag to clear. If None, clears the root tag.
+        """
+        if tag is None:
+            tag = self._soup
+        assert tag is not None
+        tag.clear()
+
+    def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
+        """
+        Replace one tag with another.
+
+        Args:
+            old_tag (Tag): Tag to replace
+            new_tag (Tag): Replacement tag
+        """
+        old_tag.replace_with(new_tag)
+
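+    # The four mutators above differ in what survives: decompose() deletes a
+    # tag and its subtree, extract() detaches the tag but returns it for
+    # reuse, clear() empties a tag while keeping it in the tree, and
+    # replace_with() swaps one tag for another in place. Sketch (assumes
+    # `ad`, `sidebar`, and `banner` are Tags from this document and `stub`
+    # is a replacement Tag; all names are illustrative):
+    #
+    #     doc.decompose(ad)               # ad and its contents are gone
+    #     kept = doc.extract(sidebar)     # sidebar detached, Tag kept for reuse
+    #     doc.replace_with(banner, stub)  # banner out, stub in
+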
+    def encode(self, encoding="utf-8", errors="strict") -> bytes:
+        """Encode the document to a specific encoding with error handling."""
+        try:
+            return str(self._soup).encode(encoding, errors)
+        except Exception:
+            # Fall back to UTF-8 if the requested encoding cannot represent
+            # the document.
+            return str(self._soup).encode("utf-8", errors)
+
+    def decode(self, encoding="utf-8", errors="strict") -> str:
+        """Return the document as a str.
+
+        The parsed soup is already held as text in memory, so the encoding
+        arguments are accepted only for API compatibility; decoding simply
+        returns the string representation.
+        """
+        return str(self._soup)
+
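+    # encode()/decode() sketch: encode() serializes the document to bytes,
+    # while decode() just returns the in-memory string form, so a round-trip
+    # is simply:
+    #
+    #     raw = doc.encode("utf-8")    # bytes, e.g. for writing to a file
+    #     html = doc.decode()          # equivalent to str(doc)
+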
+    def __str__(self) -> str:
+        """
+        String representation of the parsed document.
+
+        Returns:
+            str: HTML content
+        """
+        return str(self._soup)
+
+    def __repr__(self) -> str:
+        """
+        Detailed representation of the Scout object.
+
+        Returns:
+            str: Scout object description
+        """
+        return f"Scout(features='{self.features}', content_length={len(self.markup)})"
+
+    def _preprocess_markup(self, markup: Union[str, bytes], encoding: Optional[str] = None) -> str:
+        """
+        Preprocess markup before parsing.
+
+        Args:
+            markup (str): Input markup
+            encoding (str, optional): Encoding to use
+
+        Returns:
+            str: Preprocessed markup
+        """
+        # Decode bytes input to str.
+        decoded_markup = decode_markup(markup, encoding)
+
+        # Basic HTML cleaning: strip comments and collapse whitespace runs.
+        # Note that the collapse also applies inside <pre> blocks.
+        decoded_markup = re.sub(r"<!--.*?-->", "", decoded_markup, flags=re.DOTALL)
+        decoded_markup = re.sub(r"\s+", " ", decoded_markup)
+
+        return decoded_markup
+
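+    # Effect of the preprocessing above on a small input (comment stripped,
+    # whitespace runs collapsed):
+    #
+    #     "<div> <!-- note -->  <p>hi</p>\n</div>"
+    #     becomes "<div> <p>hi</p> </div>"
+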
+    def wrap(self, wrapper_tag: Tag) -> Tag:
+        """Wrap the root tag in another tag with error handling."""
+        try:
+            return self._soup.wrap(wrapper_tag)
+        except Exception:
+            return wrapper_tag
+
+    def unwrap(self) -> None:
+        """Unwrap the root tag, keeping its contents in the parent, with error handling."""
+        try:
+            self._soup.unwrap()
+        except Exception:
+            pass
+
+    def insert_before(self, new_element: Tag) -> None:
+        """Insert a tag or string immediately before the root tag with error handling."""
+        try:
+            self._soup.insert_before(new_element)
+        except Exception:
+            pass
+
+    def insert_after(self, new_element: Tag) -> None:
+        """Insert a tag or string immediately after the root tag with error handling."""
+        try:
+            self._soup.insert_after(new_element)
+        except Exception:
+            pass
+
+    def append(self, tag: Tag) -> None:
+        """Append a tag to the root tag with error handling."""
+        try:
+            self._soup.append(tag)
+        except Exception:
+            pass
+
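+    # Insertion helpers sketch (assumes `note` is a Tag built elsewhere; per
+    # the except blocks above, each call silently no-ops if the underlying
+    # soup rejects it):
+    #
+    #     doc.insert_before(note)   # place note just before the root tag
+    #     doc.insert_after(note)    # or just after it
+    #     doc.append(note)          # or as the root tag's last child
+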
+    @property
+    def descendants(self):
+        """Yield all descendants of the root tag in document order."""
+        return self._soup.descendants
+
+    @property
+    def parents(self):
+        """Yield all parents of the root tag up the tree."""
+        return self._soup.parents
+
+    @property
+    def next_element(self):
+        """Return the next element in document order after the root tag."""
+        return self._soup.next_element
+
+    @property
+    def previous_element(self):
+        """Return the previous element in document order before the root tag."""
+        return self._soup.previous_element
+
+    def fetch_and_parse(self, url: str, session=None, **kwargs) -> "Scout":
+        """Fetch HTML from a URL and parse it with Scout. Prefers curl_cffi."""
+        try:
+            from curl_cffi import requests as curleq
+
+            s = session or curleq.Session()
+            resp = s.get(url, **kwargs)
+            return Scout(resp.content, features=self.features)
+        except ImportError:
+            import requests
+
+            s = session or requests.Session()
+            resp = s.get(url, **kwargs)
+            return Scout(resp.content, features=self.features)
+
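+    # fetch_and_parse sketch (needs network access; uses curl_cffi when
+    # available and falls back to plain requests otherwise):
+    #
+    #     page = doc.fetch_and_parse("https://example.com")
+    #     heading = page.select_one("h1")
+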
+    def tables_to_dataframe(self, table_index=0, pandas_module=None):
+        """Convert the nth table in the document to a pandas DataFrame."""
+        try:
+            if pandas_module:
+                pd = pandas_module
+            else:
+                import pandas as pd  # type: ignore
+        except ImportError:
+            raise ImportError("pandas is required for tables_to_dataframe. Please install pandas.")
+        tables = self.find_all("table")
+        if not tables or table_index >= len(tables):
+            return None
+        table = tables[table_index]
+        rows = table.find_all("tr")
+        # Header cells (th) are treated like ordinary cells, so any header row
+        # becomes the first data row rather than column names.
+        data = [[cell.get_text(strip=True) for cell in row.find_all(["td", "th"])] for row in rows]
+        return pd.DataFrame(data)
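+
+    # tables_to_dataframe sketch (requires pandas; returns None when there is
+    # no table at the requested index, and any header row arrives as data):
+    #
+    #     df = doc.tables_to_dataframe(table_index=0)
+    #     if df is not None:
+    #         print(df.head())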