webscout-8.2.9-py3-none-any.whl → webscout-2026.1.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (413)
  1. webscout/AIauto.py +524 -251
  2. webscout/AIbase.py +247 -319
  3. webscout/AIutel.py +68 -703
  4. webscout/Bard.py +1072 -1026
  5. webscout/Extra/GitToolkit/__init__.py +10 -10
  6. webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
  7. webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
  8. webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
  10. webscout/Extra/GitToolkit/gitapi/search.py +162 -0
  11. webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
  12. webscout/Extra/GitToolkit/gitapi/user.py +128 -96
  13. webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
  14. webscout/Extra/YTToolkit/README.md +443 -375
  15. webscout/Extra/YTToolkit/YTdownloader.py +953 -957
  16. webscout/Extra/YTToolkit/__init__.py +3 -3
  17. webscout/Extra/YTToolkit/transcriber.py +595 -476
  18. webscout/Extra/YTToolkit/ytapi/README.md +230 -44
  19. webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
  20. webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
  21. webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
  22. webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
  23. webscout/Extra/YTToolkit/ytapi/extras.py +178 -118
  24. webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
  25. webscout/Extra/YTToolkit/ytapi/https.py +89 -88
  26. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
  27. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
  28. webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
  29. webscout/Extra/YTToolkit/ytapi/query.py +143 -40
  30. webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
  31. webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
  32. webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
  33. webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
  34. webscout/Extra/YTToolkit/ytapi/video.py +403 -232
  35. webscout/Extra/__init__.py +2 -3
  36. webscout/Extra/gguf.py +1298 -684
  37. webscout/Extra/tempmail/README.md +487 -487
  38. webscout/Extra/tempmail/__init__.py +28 -28
  39. webscout/Extra/tempmail/async_utils.py +143 -141
  40. webscout/Extra/tempmail/base.py +172 -161
  41. webscout/Extra/tempmail/cli.py +191 -187
  42. webscout/Extra/tempmail/emailnator.py +88 -84
  43. webscout/Extra/tempmail/mail_tm.py +378 -361
  44. webscout/Extra/tempmail/temp_mail_io.py +304 -292
  45. webscout/Extra/weather.py +196 -194
  46. webscout/Extra/weather_ascii.py +17 -15
  47. webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
  48. webscout/Provider/AISEARCH/Perplexity.py +292 -333
  49. webscout/Provider/AISEARCH/README.md +106 -279
  50. webscout/Provider/AISEARCH/__init__.py +16 -9
  51. webscout/Provider/AISEARCH/brave_search.py +298 -0
  52. webscout/Provider/AISEARCH/iask_search.py +357 -410
  53. webscout/Provider/AISEARCH/monica_search.py +200 -220
  54. webscout/Provider/AISEARCH/webpilotai_search.py +242 -255
  55. webscout/Provider/Algion.py +413 -0
  56. webscout/Provider/Andi.py +74 -69
  57. webscout/Provider/Apriel.py +313 -0
  58. webscout/Provider/Ayle.py +323 -0
  59. webscout/Provider/ChatSandbox.py +329 -342
  60. webscout/Provider/ClaudeOnline.py +365 -0
  61. webscout/Provider/Cohere.py +232 -208
  62. webscout/Provider/DeepAI.py +367 -0
  63. webscout/Provider/Deepinfra.py +467 -340
  64. webscout/Provider/EssentialAI.py +217 -0
  65. webscout/Provider/ExaAI.py +274 -261
  66. webscout/Provider/Gemini.py +175 -169
  67. webscout/Provider/GithubChat.py +385 -369
  68. webscout/Provider/Gradient.py +286 -0
  69. webscout/Provider/Groq.py +556 -801
  70. webscout/Provider/HadadXYZ.py +323 -0
  71. webscout/Provider/HeckAI.py +392 -375
  72. webscout/Provider/HuggingFace.py +387 -0
  73. webscout/Provider/IBM.py +340 -0
  74. webscout/Provider/Jadve.py +317 -291
  75. webscout/Provider/K2Think.py +306 -0
  76. webscout/Provider/Koboldai.py +221 -384
  77. webscout/Provider/Netwrck.py +273 -270
  78. webscout/Provider/Nvidia.py +310 -0
  79. webscout/Provider/OPENAI/DeepAI.py +489 -0
  80. webscout/Provider/OPENAI/K2Think.py +423 -0
  81. webscout/Provider/OPENAI/PI.py +463 -0
  82. webscout/Provider/OPENAI/README.md +890 -952
  83. webscout/Provider/OPENAI/TogetherAI.py +405 -0
  84. webscout/Provider/OPENAI/TwoAI.py +255 -357
  85. webscout/Provider/OPENAI/__init__.py +148 -40
  86. webscout/Provider/OPENAI/ai4chat.py +348 -293
  87. webscout/Provider/OPENAI/akashgpt.py +436 -0
  88. webscout/Provider/OPENAI/algion.py +303 -0
  89. webscout/Provider/OPENAI/{exachat.py → ayle.py} +365 -444
  90. webscout/Provider/OPENAI/base.py +253 -249
  91. webscout/Provider/OPENAI/cerebras.py +296 -0
  92. webscout/Provider/OPENAI/chatgpt.py +870 -556
  93. webscout/Provider/OPENAI/chatsandbox.py +233 -173
  94. webscout/Provider/OPENAI/deepinfra.py +403 -322
  95. webscout/Provider/OPENAI/e2b.py +2370 -1414
  96. webscout/Provider/OPENAI/elmo.py +278 -0
  97. webscout/Provider/OPENAI/exaai.py +452 -417
  98. webscout/Provider/OPENAI/freeassist.py +446 -0
  99. webscout/Provider/OPENAI/gradient.py +448 -0
  100. webscout/Provider/OPENAI/groq.py +380 -364
  101. webscout/Provider/OPENAI/hadadxyz.py +292 -0
  102. webscout/Provider/OPENAI/heckai.py +333 -308
  103. webscout/Provider/OPENAI/huggingface.py +321 -0
  104. webscout/Provider/OPENAI/ibm.py +425 -0
  105. webscout/Provider/OPENAI/llmchat.py +253 -0
  106. webscout/Provider/OPENAI/llmchatco.py +378 -335
  107. webscout/Provider/OPENAI/meta.py +541 -0
  108. webscout/Provider/OPENAI/netwrck.py +374 -357
  109. webscout/Provider/OPENAI/nvidia.py +317 -0
  110. webscout/Provider/OPENAI/oivscode.py +348 -287
  111. webscout/Provider/OPENAI/openrouter.py +328 -0
  112. webscout/Provider/OPENAI/pydantic_imports.py +1 -172
  113. webscout/Provider/OPENAI/sambanova.py +397 -0
  114. webscout/Provider/OPENAI/sonus.py +305 -304
  115. webscout/Provider/OPENAI/textpollinations.py +370 -339
  116. webscout/Provider/OPENAI/toolbaz.py +375 -413
  117. webscout/Provider/OPENAI/typefully.py +419 -355
  118. webscout/Provider/OPENAI/typliai.py +279 -0
  119. webscout/Provider/OPENAI/utils.py +314 -318
  120. webscout/Provider/OPENAI/wisecat.py +359 -387
  121. webscout/Provider/OPENAI/writecream.py +185 -163
  122. webscout/Provider/OPENAI/x0gpt.py +462 -365
  123. webscout/Provider/OPENAI/zenmux.py +380 -0
  124. webscout/Provider/OpenRouter.py +386 -0
  125. webscout/Provider/Openai.py +337 -496
  126. webscout/Provider/PI.py +443 -429
  127. webscout/Provider/QwenLM.py +346 -254
  128. webscout/Provider/STT/__init__.py +28 -0
  129. webscout/Provider/STT/base.py +303 -0
  130. webscout/Provider/STT/elevenlabs.py +264 -0
  131. webscout/Provider/Sambanova.py +317 -0
  132. webscout/Provider/TTI/README.md +69 -82
  133. webscout/Provider/TTI/__init__.py +37 -7
  134. webscout/Provider/TTI/base.py +147 -64
  135. webscout/Provider/TTI/claudeonline.py +393 -0
  136. webscout/Provider/TTI/magicstudio.py +292 -201
  137. webscout/Provider/TTI/miragic.py +180 -0
  138. webscout/Provider/TTI/pollinations.py +331 -221
  139. webscout/Provider/TTI/together.py +334 -0
  140. webscout/Provider/TTI/utils.py +14 -11
  141. webscout/Provider/TTS/README.md +186 -192
  142. webscout/Provider/TTS/__init__.py +43 -10
  143. webscout/Provider/TTS/base.py +523 -159
  144. webscout/Provider/TTS/deepgram.py +286 -156
  145. webscout/Provider/TTS/elevenlabs.py +189 -111
  146. webscout/Provider/TTS/freetts.py +218 -0
  147. webscout/Provider/TTS/murfai.py +288 -113
  148. webscout/Provider/TTS/openai_fm.py +364 -129
  149. webscout/Provider/TTS/parler.py +203 -111
  150. webscout/Provider/TTS/qwen.py +334 -0
  151. webscout/Provider/TTS/sherpa.py +286 -0
  152. webscout/Provider/TTS/speechma.py +693 -580
  153. webscout/Provider/TTS/streamElements.py +275 -333
  154. webscout/Provider/TTS/utils.py +280 -280
  155. webscout/Provider/TextPollinationsAI.py +331 -308
  156. webscout/Provider/TogetherAI.py +450 -0
  157. webscout/Provider/TwoAI.py +309 -475
  158. webscout/Provider/TypliAI.py +311 -305
  159. webscout/Provider/UNFINISHED/ChatHub.py +219 -209
  160. webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +331 -326
  161. webscout/Provider/{GizAI.py → UNFINISHED/GizAI.py} +300 -295
  162. webscout/Provider/{Marcus.py → UNFINISHED/Marcus.py} +218 -198
  163. webscout/Provider/UNFINISHED/Qodo.py +481 -0
  164. webscout/Provider/{MCPCore.py → UNFINISHED/XenAI.py} +330 -315
  165. webscout/Provider/UNFINISHED/Youchat.py +347 -330
  166. webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
  167. webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
  168. webscout/Provider/UNFINISHED/liner.py +342 -0
  169. webscout/Provider/UNFINISHED/liner_api_request.py +246 -263
  170. webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +231 -224
  171. webscout/Provider/WiseCat.py +256 -233
  172. webscout/Provider/WrDoChat.py +390 -370
  173. webscout/Provider/__init__.py +115 -174
  174. webscout/Provider/ai4chat.py +181 -174
  175. webscout/Provider/akashgpt.py +330 -335
  176. webscout/Provider/cerebras.py +397 -290
  177. webscout/Provider/cleeai.py +236 -213
  178. webscout/Provider/elmo.py +291 -283
  179. webscout/Provider/geminiapi.py +343 -208
  180. webscout/Provider/julius.py +245 -223
  181. webscout/Provider/learnfastai.py +333 -325
  182. webscout/Provider/llama3mitril.py +230 -215
  183. webscout/Provider/llmchat.py +308 -258
  184. webscout/Provider/llmchatco.py +321 -306
  185. webscout/Provider/meta.py +996 -801
  186. webscout/Provider/oivscode.py +332 -309
  187. webscout/Provider/searchchat.py +316 -292
  188. webscout/Provider/sonus.py +264 -258
  189. webscout/Provider/toolbaz.py +359 -353
  190. webscout/Provider/turboseek.py +332 -266
  191. webscout/Provider/typefully.py +262 -202
  192. webscout/Provider/x0gpt.py +332 -299
  193. webscout/__init__.py +31 -39
  194. webscout/__main__.py +5 -5
  195. webscout/cli.py +585 -524
  196. webscout/client.py +1497 -70
  197. webscout/conversation.py +140 -436
  198. webscout/exceptions.py +383 -362
  199. webscout/litagent/__init__.py +29 -29
  200. webscout/litagent/agent.py +492 -455
  201. webscout/litagent/constants.py +60 -60
  202. webscout/models.py +505 -181
  203. webscout/optimizers.py +74 -420
  204. webscout/prompt_manager.py +376 -288
  205. webscout/sanitize.py +1514 -0
  206. webscout/scout/README.md +452 -404
  207. webscout/scout/__init__.py +8 -8
  208. webscout/scout/core/__init__.py +7 -7
  209. webscout/scout/core/crawler.py +330 -210
  210. webscout/scout/core/scout.py +800 -607
  211. webscout/scout/core/search_result.py +51 -96
  212. webscout/scout/core/text_analyzer.py +64 -63
  213. webscout/scout/core/text_utils.py +412 -277
  214. webscout/scout/core/web_analyzer.py +54 -52
  215. webscout/scout/element.py +872 -478
  216. webscout/scout/parsers/__init__.py +70 -69
  217. webscout/scout/parsers/html5lib_parser.py +182 -172
  218. webscout/scout/parsers/html_parser.py +238 -236
  219. webscout/scout/parsers/lxml_parser.py +203 -178
  220. webscout/scout/utils.py +38 -37
  221. webscout/search/__init__.py +47 -0
  222. webscout/search/base.py +201 -0
  223. webscout/search/bing_main.py +45 -0
  224. webscout/search/brave_main.py +92 -0
  225. webscout/search/duckduckgo_main.py +57 -0
  226. webscout/search/engines/__init__.py +127 -0
  227. webscout/search/engines/bing/__init__.py +15 -0
  228. webscout/search/engines/bing/base.py +35 -0
  229. webscout/search/engines/bing/images.py +114 -0
  230. webscout/search/engines/bing/news.py +96 -0
  231. webscout/search/engines/bing/suggestions.py +36 -0
  232. webscout/search/engines/bing/text.py +109 -0
  233. webscout/search/engines/brave/__init__.py +19 -0
  234. webscout/search/engines/brave/base.py +47 -0
  235. webscout/search/engines/brave/images.py +213 -0
  236. webscout/search/engines/brave/news.py +353 -0
  237. webscout/search/engines/brave/suggestions.py +318 -0
  238. webscout/search/engines/brave/text.py +167 -0
  239. webscout/search/engines/brave/videos.py +364 -0
  240. webscout/search/engines/duckduckgo/__init__.py +25 -0
  241. webscout/search/engines/duckduckgo/answers.py +80 -0
  242. webscout/search/engines/duckduckgo/base.py +189 -0
  243. webscout/search/engines/duckduckgo/images.py +100 -0
  244. webscout/search/engines/duckduckgo/maps.py +183 -0
  245. webscout/search/engines/duckduckgo/news.py +70 -0
  246. webscout/search/engines/duckduckgo/suggestions.py +22 -0
  247. webscout/search/engines/duckduckgo/text.py +221 -0
  248. webscout/search/engines/duckduckgo/translate.py +48 -0
  249. webscout/search/engines/duckduckgo/videos.py +80 -0
  250. webscout/search/engines/duckduckgo/weather.py +84 -0
  251. webscout/search/engines/mojeek.py +61 -0
  252. webscout/search/engines/wikipedia.py +77 -0
  253. webscout/search/engines/yahoo/__init__.py +41 -0
  254. webscout/search/engines/yahoo/answers.py +19 -0
  255. webscout/search/engines/yahoo/base.py +34 -0
  256. webscout/search/engines/yahoo/images.py +323 -0
  257. webscout/search/engines/yahoo/maps.py +19 -0
  258. webscout/search/engines/yahoo/news.py +258 -0
  259. webscout/search/engines/yahoo/suggestions.py +140 -0
  260. webscout/search/engines/yahoo/text.py +273 -0
  261. webscout/search/engines/yahoo/translate.py +19 -0
  262. webscout/search/engines/yahoo/videos.py +302 -0
  263. webscout/search/engines/yahoo/weather.py +220 -0
  264. webscout/search/engines/yandex.py +67 -0
  265. webscout/search/engines/yep/__init__.py +13 -0
  266. webscout/search/engines/yep/base.py +34 -0
  267. webscout/search/engines/yep/images.py +101 -0
  268. webscout/search/engines/yep/suggestions.py +38 -0
  269. webscout/search/engines/yep/text.py +99 -0
  270. webscout/search/http_client.py +172 -0
  271. webscout/search/results.py +141 -0
  272. webscout/search/yahoo_main.py +57 -0
  273. webscout/search/yep_main.py +48 -0
  274. webscout/server/__init__.py +48 -0
  275. webscout/server/config.py +78 -0
  276. webscout/server/exceptions.py +69 -0
  277. webscout/server/providers.py +286 -0
  278. webscout/server/request_models.py +131 -0
  279. webscout/server/request_processing.py +404 -0
  280. webscout/server/routes.py +642 -0
  281. webscout/server/server.py +351 -0
  282. webscout/server/ui_templates.py +1171 -0
  283. webscout/swiftcli/__init__.py +79 -95
  284. webscout/swiftcli/core/__init__.py +7 -7
  285. webscout/swiftcli/core/cli.py +574 -297
  286. webscout/swiftcli/core/context.py +98 -104
  287. webscout/swiftcli/core/group.py +268 -241
  288. webscout/swiftcli/decorators/__init__.py +28 -28
  289. webscout/swiftcli/decorators/command.py +243 -221
  290. webscout/swiftcli/decorators/options.py +247 -220
  291. webscout/swiftcli/decorators/output.py +392 -252
  292. webscout/swiftcli/exceptions.py +21 -21
  293. webscout/swiftcli/plugins/__init__.py +9 -9
  294. webscout/swiftcli/plugins/base.py +134 -135
  295. webscout/swiftcli/plugins/manager.py +269 -269
  296. webscout/swiftcli/utils/__init__.py +58 -59
  297. webscout/swiftcli/utils/formatting.py +251 -252
  298. webscout/swiftcli/utils/parsing.py +368 -267
  299. webscout/update_checker.py +280 -136
  300. webscout/utils.py +28 -14
  301. webscout/version.py +2 -1
  302. webscout/version.py.bak +3 -0
  303. webscout/zeroart/__init__.py +218 -135
  304. webscout/zeroart/base.py +70 -66
  305. webscout/zeroart/effects.py +155 -101
  306. webscout/zeroart/fonts.py +1799 -1239
  307. webscout-2026.1.19.dist-info/METADATA +638 -0
  308. webscout-2026.1.19.dist-info/RECORD +312 -0
  309. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
  310. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/entry_points.txt +1 -1
  311. webscout/DWEBS.py +0 -520
  312. webscout/Extra/Act.md +0 -309
  313. webscout/Extra/GitToolkit/gitapi/README.md +0 -110
  314. webscout/Extra/autocoder/__init__.py +0 -9
  315. webscout/Extra/autocoder/autocoder.py +0 -1105
  316. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  317. webscout/Extra/gguf.md +0 -430
  318. webscout/Extra/weather.md +0 -281
  319. webscout/Litlogger/README.md +0 -10
  320. webscout/Litlogger/__init__.py +0 -15
  321. webscout/Litlogger/formats.py +0 -4
  322. webscout/Litlogger/handlers.py +0 -103
  323. webscout/Litlogger/levels.py +0 -13
  324. webscout/Litlogger/logger.py +0 -92
  325. webscout/Provider/AI21.py +0 -177
  326. webscout/Provider/AISEARCH/DeepFind.py +0 -254
  327. webscout/Provider/AISEARCH/felo_search.py +0 -202
  328. webscout/Provider/AISEARCH/genspark_search.py +0 -324
  329. webscout/Provider/AISEARCH/hika_search.py +0 -186
  330. webscout/Provider/AISEARCH/scira_search.py +0 -298
  331. webscout/Provider/Aitopia.py +0 -316
  332. webscout/Provider/AllenAI.py +0 -440
  333. webscout/Provider/Blackboxai.py +0 -791
  334. webscout/Provider/ChatGPTClone.py +0 -237
  335. webscout/Provider/ChatGPTGratis.py +0 -194
  336. webscout/Provider/Cloudflare.py +0 -324
  337. webscout/Provider/ExaChat.py +0 -358
  338. webscout/Provider/Flowith.py +0 -217
  339. webscout/Provider/FreeGemini.py +0 -250
  340. webscout/Provider/Glider.py +0 -225
  341. webscout/Provider/HF_space/__init__.py +0 -0
  342. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  343. webscout/Provider/HuggingFaceChat.py +0 -469
  344. webscout/Provider/Hunyuan.py +0 -283
  345. webscout/Provider/LambdaChat.py +0 -411
  346. webscout/Provider/Llama3.py +0 -259
  347. webscout/Provider/Nemotron.py +0 -218
  348. webscout/Provider/OLLAMA.py +0 -396
  349. webscout/Provider/OPENAI/BLACKBOXAI.py +0 -766
  350. webscout/Provider/OPENAI/Cloudflare.py +0 -378
  351. webscout/Provider/OPENAI/FreeGemini.py +0 -283
  352. webscout/Provider/OPENAI/NEMOTRON.py +0 -232
  353. webscout/Provider/OPENAI/Qwen3.py +0 -283
  354. webscout/Provider/OPENAI/api.py +0 -969
  355. webscout/Provider/OPENAI/c4ai.py +0 -373
  356. webscout/Provider/OPENAI/chatgptclone.py +0 -494
  357. webscout/Provider/OPENAI/copilot.py +0 -242
  358. webscout/Provider/OPENAI/flowith.py +0 -162
  359. webscout/Provider/OPENAI/freeaichat.py +0 -359
  360. webscout/Provider/OPENAI/mcpcore.py +0 -389
  361. webscout/Provider/OPENAI/multichat.py +0 -376
  362. webscout/Provider/OPENAI/opkfc.py +0 -496
  363. webscout/Provider/OPENAI/scirachat.py +0 -477
  364. webscout/Provider/OPENAI/standardinput.py +0 -433
  365. webscout/Provider/OPENAI/typegpt.py +0 -364
  366. webscout/Provider/OPENAI/uncovrAI.py +0 -463
  367. webscout/Provider/OPENAI/venice.py +0 -431
  368. webscout/Provider/OPENAI/yep.py +0 -382
  369. webscout/Provider/OpenGPT.py +0 -209
  370. webscout/Provider/Perplexitylabs.py +0 -415
  371. webscout/Provider/Reka.py +0 -214
  372. webscout/Provider/StandardInput.py +0 -290
  373. webscout/Provider/TTI/aiarta.py +0 -365
  374. webscout/Provider/TTI/artbit.py +0 -0
  375. webscout/Provider/TTI/fastflux.py +0 -200
  376. webscout/Provider/TTI/piclumen.py +0 -203
  377. webscout/Provider/TTI/pixelmuse.py +0 -225
  378. webscout/Provider/TTS/gesserit.py +0 -128
  379. webscout/Provider/TTS/sthir.py +0 -94
  380. webscout/Provider/TeachAnything.py +0 -229
  381. webscout/Provider/UNFINISHED/puterjs.py +0 -635
  382. webscout/Provider/UNFINISHED/test_lmarena.py +0 -119
  383. webscout/Provider/Venice.py +0 -258
  384. webscout/Provider/VercelAI.py +0 -253
  385. webscout/Provider/Writecream.py +0 -246
  386. webscout/Provider/WritingMate.py +0 -269
  387. webscout/Provider/asksteve.py +0 -220
  388. webscout/Provider/chatglm.py +0 -215
  389. webscout/Provider/copilot.py +0 -425
  390. webscout/Provider/freeaichat.py +0 -285
  391. webscout/Provider/granite.py +0 -235
  392. webscout/Provider/hermes.py +0 -266
  393. webscout/Provider/koala.py +0 -170
  394. webscout/Provider/lmarena.py +0 -198
  395. webscout/Provider/multichat.py +0 -364
  396. webscout/Provider/scira_chat.py +0 -299
  397. webscout/Provider/scnet.py +0 -243
  398. webscout/Provider/talkai.py +0 -194
  399. webscout/Provider/typegpt.py +0 -289
  400. webscout/Provider/uncovr.py +0 -368
  401. webscout/Provider/yep.py +0 -389
  402. webscout/litagent/Readme.md +0 -276
  403. webscout/litprinter/__init__.py +0 -59
  404. webscout/swiftcli/Readme.md +0 -323
  405. webscout/tempid.py +0 -128
  406. webscout/webscout_search.py +0 -1184
  407. webscout/webscout_search_async.py +0 -654
  408. webscout/yep_search.py +0 -347
  409. webscout/zeroart/README.md +0 -89
  410. webscout-8.2.9.dist-info/METADATA +0 -1033
  411. webscout-8.2.9.dist-info/RECORD +0 -289
  412. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/licenses/LICENSE.md +0 -0
  413. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/top_level.txt +0 -0
webscout/scout/core/scout.py
@@ -1,607 +1,800 @@
- """
- Scout Main Module - HTML Parsing and Traversal
- """
- import hashlib
- import json
- import re
- import unicodedata
- import urllib.parse
- from typing import Any, Dict, List, Optional
-
- from ..element import NavigableString, Tag
- from ..parsers import ParserRegistry
- from ..utils import decode_markup
- from .search_result import ScoutSearchResult
- from .text_analyzer import ScoutTextAnalyzer
- from .text_utils import SentenceTokenizer
- from .web_analyzer import ScoutWebAnalyzer
-
-
- class Scout:
-     """
-     Scout - Making web scraping a breeze! 🌊
-     A comprehensive HTML parsing and traversal library.
-     Enhanced with advanced features and intelligent parsing.
-     """
-
-     def __init__(self, markup="", features='html.parser', from_encoding=None, **kwargs):
-         """
-         Initialize Scout with HTML content.
-
-         Args:
-             markup (str): HTML content to parse
-             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
-             from_encoding (str): Source encoding (if known)
-             **kwargs: Additional parsing options
-         """
-         # Intelligent markup handling
-         self.markup = self._preprocess_markup(markup, from_encoding)
-         self.features = features
-         self.from_encoding = from_encoding
-
-         # Get the right parser for the job
-         if features not in ParserRegistry.list_parsers():
-             raise ValueError(
-                 f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
-             )
-
-         parser_class = ParserRegistry.get_parser(features)
-         self.parser = parser_class
-
-         # Parse that HTML! 🎯
-         self._soup = self.parser.parse(self.markup)
-
-         # BeautifulSoup-like attributes
-         self.name = self._soup.name if hasattr(self._soup, 'name') else None
-         self.attrs = self._soup.attrs if hasattr(self._soup, 'attrs') else {}
-
-         # Advanced parsing options
-         self._cache = {}
-
-         # Text and web analyzers
-         self.text_analyzer = ScoutTextAnalyzer()
-         self.web_analyzer = ScoutWebAnalyzer()
-
-     def normalize_text(self, text: str, form='NFKD') -> str:
-         """
-         Normalize text using Unicode normalization.
-
-         Args:
-             text (str): Input text
-             form (str, optional): Normalization form
-
-         Returns:
-             str: Normalized text
-         """
-         return unicodedata.normalize(form, text)
-
-     def url_parse(self, url: str) -> Dict[str, str]:
-         """
-         Parse and analyze a URL.
-
-         Args:
-             url (str): URL to parse
-
-         Returns:
-             Dict[str, str]: Parsed URL components
-         """
-         parsed = urllib.parse.urlparse(url)
-         return {
-             'scheme': parsed.scheme,
-             'netloc': parsed.netloc,
-             'path': parsed.path,
-             'params': parsed.params,
-             'query': parsed.query,
-             'fragment': parsed.fragment
-         }
-
-     def analyze_page_structure(self) -> Dict[str, Any]:
-         """
-         Analyze the structure of the parsed page.
-
-         Returns:
-             Dict[str, Any]: Page structure analysis
-         """
-         return self.web_analyzer.analyze_page_structure(self)
-
-     def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
-         """
-         Perform advanced text analysis.
-
-         Args:
-             text (str, optional): Text to analyze. If None, uses page text.
-
-         Returns:
-             Dict[str, Any]: Text analysis results
-         """
-         if text is None:
-             text = self.get_text()
-
-         return {
-             'word_count': self.text_analyzer.count_words(text),
-             'entities': self.text_analyzer.extract_entities(text),
-             'tokens': self.text_analyzer.tokenize(text)
-         }
-
-     def extract_semantic_info(self) -> Dict[str, Any]:
-         """
-         Extract semantic information from the document.
-
-         Returns:
-             Dict[str, Any]: Semantic information
-         """
-         semantic_info = {
-             'headings': {
-                 'h1': [h.get_text(strip=True) for h in self.find_all('h1')],
-                 'h2': [h.get_text(strip=True) for h in self.find_all('h2')],
-                 'h3': [h.get_text(strip=True) for h in self.find_all('h3')]
-             },
-             'lists': {
-                 'ul': [ul.find_all('li') for ul in self.find_all('ul')],
-                 'ol': [ol.find_all('li') for ol in self.find_all('ol')]
-             },
-             'tables': {
-                 'count': len(self.find_all('table')),
-                 'headers': [table.find_all('th') for table in self.find_all('table')]
-             }
-         }
-         return semantic_info
-
-     def cache(self, key: str, value: Any = None) -> Any:
-         """
-         Manage a cache for parsed content.
-
-         Args:
-             key (str): Cache key
-             value (Any, optional): Value to cache
-
-         Returns:
-             Any: Cached value or None
-         """
-         if value is not None:
-             self._cache[key] = value
-         return self._cache.get(key)
-
-     def hash_content(self, method='md5') -> str:
-         """
-         Generate a hash of the parsed content.
-
-         Args:
-             method (str, optional): Hashing method
-
-         Returns:
-             str: Content hash
-         """
-         hash_methods = {
-             'md5': hashlib.md5,
-             'sha1': hashlib.sha1,
-             'sha256': hashlib.sha256
-         }
-
-         if method not in hash_methods:
-             raise ValueError(f"Unsupported hash method: {method}")
-
-         hasher = hash_methods[method]()
-         hasher.update(str(self._soup).encode('utf-8'))
-         return hasher.hexdigest()
-
-     def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
-         """
-         Extract all links from the document.
-
-         Args:
-             base_url (str, optional): Base URL for resolving relative links
-
-         Returns:
-             List[Dict[str, str]]: List of link dictionaries
-         """
-         links = []
-         for link in self.find_all(['a', 'link']):
-             href = link.get('href')
-             if href:
-                 # Resolve relative URLs if base_url is provided
-                 if base_url and not href.startswith(('http://', 'https://', '//')):
-                     href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
-
-                 links.append({
-                     'href': href,
-                     'text': link.get_text(strip=True),
-                     'rel': link.get('rel', [None])[0],
-                     'type': link.get('type')
-                 })
-         return links
-
-     def extract_metadata(self) -> Dict[str, Any]:
-         """
-         Extract metadata from HTML document.
-
-         Returns:
-             Dict[str, Any]: Extracted metadata
-         """
-         metadata = {
-             'title': self.find('title').texts()[0] if self.find('title').texts() else None,
-             'description': self.find('meta', attrs={'name': 'description'}).attrs('content')[0] if self.find('meta', attrs={'name': 'description'}).attrs('content') else None,
-             'keywords': self.find('meta', attrs={'name': 'keywords'}).attrs('content')[0].split(',') if self.find('meta', attrs={'name': 'keywords'}).attrs('content') else [],
-             'og_metadata': {},
-             'twitter_metadata': {}
-         }
-
-         # Open Graph metadata
-         for meta in self.find_all('meta', attrs={'property': re.compile(r'^og:')}):
-             key = meta.attrs('property')[0][3:]
-             metadata['og_metadata'][key] = meta.attrs('content')[0]
-
-         # Twitter Card metadata
-         for meta in self.find_all('meta', attrs={'name': re.compile(r'^twitter:')}):
-             key = meta.attrs('name')[0][8:]
-             metadata['twitter_metadata'][key] = meta.attrs('content')[0]
-
-         return metadata
-
-     def to_json(self, indent=2) -> str:
-         """
-         Convert parsed content to JSON.
-
-         Args:
-             indent (int, optional): JSON indentation
-
-         Returns:
-             str: JSON representation of the document
-         """
-         def _tag_to_dict(tag):
-             if isinstance(tag, NavigableString):
-                 return str(tag)
-
-             result = {
-                 'name': tag.name,
-                 'attrs': tag.attrs,
-                 'text': tag.get_text(strip=True)
-             }
-
-             if tag.contents:
-                 result['children'] = [_tag_to_dict(child) for child in tag.contents]
-
-             return result
-
-         return json.dumps(_tag_to_dict(self._soup), indent=indent)
-
-     def find(self, name=None, attrs={}, recursive=True, text=None, **kwargs) -> ScoutSearchResult:
-         """
-         Find the first matching element.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-             recursive (bool, optional): Search recursively
-             text (str, optional): Text content to match
-
-         Returns:
-             ScoutSearchResult: First matching element
-         """
-         result = self._soup.find(name, attrs, recursive, text, **kwargs)
-         return ScoutSearchResult([result]) if result else ScoutSearchResult([])
-
-     def find_all(self, name=None, attrs={}, recursive=True, text=None, limit=None, **kwargs) -> ScoutSearchResult:
-         """
-         Find all matching elements.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-             recursive (bool, optional): Search recursively
-             text (str, optional): Text content to match
-             limit (int, optional): Maximum number of results
-
-         Returns:
-             ScoutSearchResult: List of matching elements
-         """
-         results = self._soup.find_all(name, attrs, recursive, text, limit, **kwargs)
-         return ScoutSearchResult(results)
-
-     def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
-         """
-         Find the first parent matching given criteria.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-
-         Returns:
-             Tag or None: First matching parent
-         """
-         current = self._soup.parent
-         while current:
-             if (name is None or current.name == name) and \
-                all(current.get(k) == v for k, v in attrs.items()):
-                 return current
-             current = current.parent
-         return None
-
-     def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
-         """
-         Find all parents matching given criteria.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-             limit (int, optional): Maximum number of results
-
-         Returns:
-             List[Tag]: List of matching parents
-         """
-         parents = []
-         current = self._soup.parent
-         while current and (limit is None or len(parents) < limit):
-             if (name is None or current.name == name) and \
-                all(current.get(k) == v for k, v in attrs.items()):
-                 parents.append(current)
-             current = current.parent
-         return parents
-
-     def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
-         """
-         Find the next sibling matching given criteria.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-
-         Returns:
-             Tag or None: First matching next sibling
-         """
-         if not self._soup.parent:
-             return None
-
-         siblings = self._soup.parent.contents
-         try:
-             current_index = siblings.index(self._soup)
-             for sibling in siblings[current_index + 1:]:
-                 if isinstance(sibling, Tag):
-                     if (name is None or sibling.name == name) and \
-                        all(sibling.get(k) == v for k, v in attrs.items()):
-                         return sibling
-         except ValueError:
-             pass
-         return None
-
-     def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
-         """
-         Find all next siblings matching given criteria.
-
-         Args:
-             name (str, optional): Tag name to search for
-             attrs (dict, optional): Attributes to match
-             limit (int, optional): Maximum number of results
-
-         Returns:
-             List[Tag]: List of matching next siblings
-         """
-         if not self._soup.parent:
-             return []
-
-         siblings = []
-         siblings_list = self._soup.parent.contents
-         try:
-             current_index = siblings_list.index(self._soup)
-             for sibling in siblings_list[current_index + 1:]:
-                 if isinstance(sibling, Tag):
-                     if (name is None or sibling.name == name) and \
-                        all(sibling.get(k) == v for k, v in attrs.items()):
-                         siblings.append(sibling)
-                         if limit and len(siblings) == limit:
-                             break
-         except ValueError:
-             pass
-         return siblings
-
-     def find_previous_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
-         """Find the previous sibling matching given criteria."""
-         if not self._soup.parent:
-             return None
-
-         siblings = self._soup.parent.contents
-         try:
-             current_index = siblings.index(self._soup)
-             for sibling in reversed(siblings[:current_index]):
-                 if isinstance(sibling, Tag):
-                     if (name is None or sibling.name == name) and all(
-                         sibling.get(k) == v for k, v in attrs.items()
-                     ):
-                         return sibling
-         except ValueError:
-             pass
-         return None
-
-     def find_previous_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
-         """Find all previous siblings matching given criteria."""
-         if not self._soup.parent:
-             return []
-
-         siblings = []
-         siblings_list = self._soup.parent.contents
-         try:
-             current_index = siblings_list.index(self._soup)
-             for sibling in reversed(siblings_list[:current_index]):
-                 if isinstance(sibling, Tag):
-                     if (name is None or sibling.name == name) and all(
-                         sibling.get(k) == v for k, v in attrs.items()
-                     ):
-                         siblings.append(sibling)
-                         if limit and len(siblings) == limit:
-                             break
-         except ValueError:
-             pass
-         return siblings
-
-     def select(self, selector: str) -> List[Tag]:
-         """
-         Select elements using CSS selector.
-
-         Args:
-             selector (str): CSS selector string
-
-         Returns:
-             List[Tag]: List of matching elements
-         """
-         return self._soup.select(selector)
-
-     def select_one(self, selector: str) -> Optional[Tag]:
-         """
-         Select the first element matching the CSS selector.
-
-         Args:
-             selector (str): CSS selector string
-
-         Returns:
-             Tag or None: First matching element
-         """
-         return self._soup.select_one(selector)
-
-     def get_text(self, separator=' ', strip=False, types=None) -> str:
-         """
-         Extract all text from the parsed document.
-
-         Args:
-             separator (str, optional): Text separator
-             strip (bool, optional): Strip whitespace
-             types (list, optional): Types of content to extract
-
-         Returns:
-             str: Extracted text
-         """
-         tokenizer = SentenceTokenizer()
-         text = self._soup.get_text(separator, strip, types)
-         sentences = tokenizer.tokenize(text)
-         return "\n\n".join(sentences)
-
-     def remove_tags(self, tags: List[str]) -> None:
-         """
-         Remove specified tags and their contents from the document.
-
-         Args:
-             tags (List[str]): List of tag names to remove
-         """
-         for tag_name in tags:
-             for tag in self._soup.find_all(tag_name):
-                 tag.decompose()
-
-     def prettify(self, formatter='minimal') -> str:
-         """
-         Return a formatted, pretty-printed version of the HTML.
-
-         Args:
-             formatter (str, optional): Formatting style
-
-         Returns:
-             str: Prettified HTML
-         """
-         return self._soup.prettify(formatter)
-
-     def decompose(self, tag: Tag = None) -> None:
-         """
-         Remove a tag and its contents from the document.
-
-         Args:
-             tag (Tag, optional): Tag to remove. If None, removes the root tag.
-         """
-         if tag is None:
-             tag = self._soup
-         tag.decompose()
-
-     def extract(self, tag: Tag = None) -> Tag:
-         """
-         Remove a tag from the document and return it.
-
-         Args:
-             tag (Tag, optional): Tag to extract. If None, extracts the root tag.
-
-         Returns:
-             Tag: Extracted tag
-         """
-         if tag is None:
-             tag = self._soup
-         return tag.extract()
-
-     def clear(self, tag: Tag = None) -> None:
-         """
-         Remove a tag's contents while keeping the tag itself.
-
-         Args:
-             tag (Tag, optional): Tag to clear. If None, clears the root tag.
-         """
-         if tag is None:
-             tag = self._soup
-         tag.clear()
-
-     def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
-         """
-         Replace one tag with another.
-
-         Args:
-             old_tag (Tag): Tag to replace
-             new_tag (Tag): Replacement tag
-         """
-         old_tag.replace_with(new_tag)
-
-     def encode(self, encoding='utf-8') -> bytes:
-         """
-         Encode the document to a specific encoding.
-
-         Args:
-             encoding (str, optional): Encoding to use
-
-         Returns:
-             bytes: Encoded document
-         """
-         return str(self._soup).encode(encoding)
-
-     def decode(self, encoding='utf-8') -> str:
-         """
-         Decode the document from a specific encoding.
-
-         Args:
-             encoding (str, optional): Encoding to use
-
-         Returns:
-             str: Decoded document
-         """
-         return str(self._soup)
-
-     def __str__(self) -> str:
-         """
-         String representation of the parsed document.
-
-         Returns:
-             str: HTML content
-         """
-         return str(self._soup)
-
-     def __repr__(self) -> str:
-         """
-         Detailed representation of the Scout object.
-
-         Returns:
-             str: Scout object description
-         """
-         return f"Scout(features='{self.features}', content_length={len(self.markup)})"
-
-     def _preprocess_markup(self, markup: str, encoding: Optional[str] = None) -> str:
-         """
-         Preprocess markup before parsing.
-
-         Args:
-             markup (str): Input markup
-             encoding (str, optional): Encoding to use
-
-         Returns:
-             str: Preprocessed markup
-         """
-         # Decode markup
-         decoded_markup = decode_markup(markup, encoding)
-
-         # Basic HTML cleaning
-         # Remove comments, normalize whitespace, etc.
-         decoded_markup = re.sub(r'<!--.*?-->', '', decoded_markup, flags=re.DOTALL)
-         decoded_markup = re.sub(r'\s+', ' ', decoded_markup)
-
-         return decoded_markup
+ """
+ Scout Main Module - HTML Parsing and Traversal
+ """
+
+ import hashlib
+ import json
+ import re
+ import unicodedata
+ import urllib.parse
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ from ..element import NavigableString, Tag
+ from ..parsers import ParserRegistry
+ from ..utils import decode_markup
+ from .search_result import ScoutSearchResult
+ from .text_analyzer import ScoutTextAnalyzer
+ from .web_analyzer import ScoutWebAnalyzer
+
+
+ class Scout:
+     """
+     Scout - Making web scraping a breeze! 🌊
+     A comprehensive HTML parsing and traversal library.
+     Enhanced with advanced features and intelligent parsing.
+     """
+
+     def __init__(
+         self,
+         markup: Union[str, bytes] = "",
+         features: str = "html.parser",
+         from_encoding: Optional[str] = None,
+         exclude_encodings: Optional[List[str]] = None,
+         element_classes: Optional[Dict[str, Any]] = None,
+         **kwargs,
+     ):
+         """
+         Initialize Scout with HTML content.
+
+         Args:
+             markup (str): HTML content to parse
+             features (str): Parser to use ('html.parser', 'lxml', 'html5lib', 'lxml-xml')
+             from_encoding (str): Source encoding (if known)
+             exclude_encodings (list): Encodings to avoid
+             element_classes (dict): Custom classes for different element types
+             **kwargs: Additional parsing options
+         """
+         # Store original markup and settings
+         self.original_encoding = from_encoding
+         self.exclude_encodings = exclude_encodings or []
+         self.element_classes = element_classes or {}
+         self.builder_features = features
+         self.contains_replacement_characters = False
+
+         # Intelligent markup handling
+         self.markup = self._preprocess_markup(markup, from_encoding)
+         self.features = features
+         self.from_encoding = from_encoding
+
+         # Get the right parser for the job
+         if features not in ParserRegistry.list_parsers():
+             raise ValueError(
+                 f"Invalid parser '{features}'! Choose from: {', '.join(ParserRegistry.list_parsers().keys())}"
+             )
+
+         parser_class = ParserRegistry.get_parser(features)
+         self.parser = parser_class
+
+         # Parse that HTML! 🎯
+         self._soup = self.parser.parse(self.markup)
+
+         # Set up the root element properly
+         if hasattr(self._soup, "name"):
+             self.name = self._soup.name
+         else:
+             self.name = "[document]"
+
+         # BS4-like attributes
+         self.attrs = self._soup.attrs if hasattr(self._soup, "attrs") else {}
+         self.contents = self._soup.contents if hasattr(self._soup, "contents") else []
+         self.parent = None
+         self.next_sibling = None
+         self.previous_sibling = None
+
+         # Advanced parsing options and caching
+         self._cache = {}
+         self._tag_name_cache = {}
+         self._css_selector_cache = {}
+
+         # Text and web analyzers
+         self.text_analyzer = ScoutTextAnalyzer()
+         self.web_analyzer = ScoutWebAnalyzer()
+
+     def normalize_text(self, text: str, form: Literal["NFC", "NFD", "NFKC", "NFKD"] = "NFKD") -> str:
+         """
+         Normalize text using Unicode normalization.
+
+         Args:
+             text (str): Input text
+             form (Literal["NFC", "NFD", "NFKC", "NFKD"], optional): Normalization form
+
+         Returns:
+             str: Normalized text
+         """
+         return unicodedata.normalize(form, text)
+
+     def url_parse(self, url: str) -> Dict[str, str]:
+         """
+         Parse and analyze a URL.
+
+         Args:
+             url (str): URL to parse
+
+         Returns:
+             Dict[str, str]: Parsed URL components
+         """
+         parsed = urllib.parse.urlparse(url)
+         return {
+             "scheme": parsed.scheme,
+             "netloc": parsed.netloc,
+             "path": parsed.path,
+             "params": parsed.params,
+             "query": parsed.query,
+             "fragment": parsed.fragment,
+         }
+
+     def analyze_page_structure(self) -> Dict[str, Any]:
+         """
+         Analyze the structure of the parsed page.
+
+         Returns:
+             Dict[str, Any]: Page structure analysis
+         """
+         return self.web_analyzer.analyze_page_structure(self)
+
+     def analyze_text(self, text: Optional[str] = None) -> Dict[str, Any]:
+         """
+         Perform advanced text analysis.
+
+         Args:
+             text (str, optional): Text to analyze. If None, uses page text.
+
+         Returns:
+             Dict[str, Any]: Text analysis results
+         """
+         if text is None:
+             text = self.get_text()
+
+         return {
+             "word_count": self.text_analyzer.count_words(text),
+             "entities": self.text_analyzer.extract_entities(text),
+             "tokens": self.text_analyzer.tokenize(text),
+         }
+
+     def extract_semantic_info(self) -> Dict[str, Any]:
+         """
+         Extract semantic information from the document.
+
+         Returns:
+             Dict[str, Any]: Semantic information
+         """
+         semantic_info = {
+             "headings": {
+                 "h1": [h.get_text(strip=True) for h in self.find_all("h1")],
+                 "h2": [h.get_text(strip=True) for h in self.find_all("h2")],
+                 "h3": [h.get_text(strip=True) for h in self.find_all("h3")],
+             },
+             "lists": {
+                 "ul": [ul.find_all("li") for ul in self.find_all("ul")],
+                 "ol": [ol.find_all("li") for ol in self.find_all("ol")],
+             },
+             "tables": {
+                 "count": len(self.find_all("table")),
+                 "headers": [table.find_all("th") for table in self.find_all("table")],
+             },
+         }
+         return semantic_info
+
+     def cache(self, key: str, value: Any = None) -> Any:
+         """
+         Manage a cache for parsed content.
+
+         Args:
+             key (str): Cache key
+             value (Any, optional): Value to cache
+
+         Returns:
+             Any: Cached value or None
+         """
+         if value is not None:
+             self._cache[key] = value
+         return self._cache.get(key)
+
+     def hash_content(self, method="md5") -> str:
+         """
+         Generate a hash of the parsed content.
+
+         Args:
+             method (str, optional): Hashing method
+
+         Returns:
+             str: Content hash
+         """
+         hash_methods = {"md5": hashlib.md5, "sha1": hashlib.sha1, "sha256": hashlib.sha256}
+
+         if method not in hash_methods:
+             raise ValueError(f"Unsupported hash method: {method}")
+
+         hasher = hash_methods[method]()
+         hasher.update(str(self._soup).encode("utf-8"))
+         return hasher.hexdigest()
+
+     def extract_links(self, base_url: Optional[str] = None) -> List[Dict[str, str]]:
+         """
+         Extract all links from the document.
+
+         Args:
+             base_url (str, optional): Base URL for resolving relative links
+
+         Returns:
+             List[Dict[str, str]]: List of link dictionaries
+         """
+         links = []
+         for link in self.find_all(["a", "link"]):
+             href = link.get("href")
+             if href:
+                 # Resolve relative URLs if base_url is provided
+                 if base_url and not href.startswith(("http://", "https://", "//")):
+                     href = f"{base_url.rstrip('/')}/{href.lstrip('/')}"
+
+                 links.append(
+                     {
+                         "href": href,
+                         "text": link.get_text(strip=True),
+                         "rel": link.get("rel", [None])[0],
+                         "type": link.get("type"),
+                     }
+                 )
+         return links
+
+     def extract_metadata(self) -> Dict[str, Any]:
+         """
+         Extract metadata from HTML document.
+
+         Returns:
+             Dict[str, Any]: Extracted metadata
+         """
+         title_tag = self.find("title")
+         desc_tag = self.find("meta", attrs={"name": "description"})
+         keywords_tag = self.find("meta", attrs={"name": "keywords"})
+
+         metadata = {
+             "title": title_tag.get_text(strip=True) if title_tag else None,
+             "description": desc_tag.get("content") if desc_tag else None,
+             "keywords": keywords_tag.get("content").split(",")
+             if keywords_tag and keywords_tag.get("content")
+             else [],
+             "og_metadata": {},
+             "twitter_metadata": {},
+         }
+
+         # Open Graph metadata
+         for meta in self.find_all("meta", attrs={"property": re.compile(r"^og:")}):
+             key = meta.get("property")
+             if key and key.startswith("og:"):
+                 if isinstance(metadata["og_metadata"], dict):
+                     metadata["og_metadata"][key[3:]] = meta.get("content")
+
+         # Twitter Card metadata
+         for meta in self.find_all("meta", attrs={"name": re.compile(r"^twitter:")}):
+             key = meta.get("name")
+             if key and key.startswith("twitter:"):
+                 if isinstance(metadata["twitter_metadata"], dict):
+                     metadata["twitter_metadata"][key[8:]] = meta.get("content")
+
+         return metadata
+
+     def to_json(self, indent=2) -> str:
+         """
+         Convert parsed content to JSON.
+
+         Args:
+             indent (int, optional): JSON indentation
+
+         Returns:
+             str: JSON representation of the document
+         """
+
+         def _tag_to_dict(tag):
+             if isinstance(tag, NavigableString):
+                 return str(tag)
+
+             result = {"name": tag.name, "attrs": tag.attrs, "text": tag.get_text(strip=True)}
+
+             if tag.contents:
+                 result["children"] = [_tag_to_dict(child) for child in tag.contents]
+
+             return result
+
+         return json.dumps(_tag_to_dict(self._soup), indent=indent)
+
+     def find(
+         self, name=None, attrs={}, recursive=True, text=None, class_=None, **kwargs
+     ) -> Optional[Tag]:
+         """
+         Find the first matching element. Returns a single Tag or None.
+         Highly compatible with BS4.
+         """
+         return self._soup.find(name, attrs, recursive, text, limit=1, class_=class_, **kwargs)
+
+     def find_all(
+         self, name=None, attrs={}, recursive=True, text=None, limit=None, class_=None, **kwargs
+     ) -> ScoutSearchResult:
+         """
+         Find all matching elements.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             recursive (bool, optional): Search recursively
+             text (str, optional): Text content to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             ScoutSearchResult: List of matching elements
+         """
+         results = self._soup.find_all(name, attrs, recursive, text, limit, class_=class_, **kwargs)
+         return ScoutSearchResult(results)
+
+     def find_parent(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
+         """
+         Find the first parent matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+
+         Returns:
+             Tag or None: First matching parent
+         """
+         current = self._soup.parent
+         while current:
+             if (name is None or current.name == name) and all(
+                 current.get(k) == v for k, v in attrs.items()
+             ):
+                 return current
+             current = current.parent
+         return None
+
+     def find_parents(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
+         """
+         Find all parents matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             List[Tag]: List of matching parents
+         """
+         parents = []
+         current = self._soup.parent
+         while current and (limit is None or len(parents) < limit):
+             if (name is None or current.name == name) and all(
+                 current.get(k) == v for k, v in attrs.items()
+             ):
+                 parents.append(current)
+             current = current.parent
+         return parents
+
+     def find_next_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
+         """
+         Find the next sibling matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+
+         Returns:
+             Tag or None: First matching next sibling
+         """
+         if not self._soup.parent:
+             return None
+
+         siblings = self._soup.parent.contents
+         try:
+             current_index = siblings.index(self._soup)
+             for sibling in siblings[current_index + 1 :]:
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and all(
+                         sibling.get(k) == v for k, v in attrs.items()
+                     ):
+                         return sibling
+         except ValueError:
+             pass
+         return None
+
+     def find_next_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
+         """
+         Find all next siblings matching given criteria.
+
+         Args:
+             name (str, optional): Tag name to search for
+             attrs (dict, optional): Attributes to match
+             limit (int, optional): Maximum number of results
+
+         Returns:
+             List[Tag]: List of matching next siblings
+         """
+         if not self._soup.parent:
+             return []
+
+         siblings = []
+         siblings_list = self._soup.parent.contents
+         try:
+             current_index = siblings_list.index(self._soup)
+             for sibling in siblings_list[current_index + 1 :]:
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and all(
+                         sibling.get(k) == v for k, v in attrs.items()
+                     ):
+                         siblings.append(sibling)
+                         if limit and len(siblings) == limit:
+                             break
+         except ValueError:
+             pass
+         return siblings
+
+     def find_previous_sibling(self, name=None, attrs={}, **kwargs) -> Optional[Tag]:
+         """Find the previous sibling matching given criteria."""
+         if not self._soup.parent:
+             return None
+
+         siblings = self._soup.parent.contents
+         try:
+             current_index = siblings.index(self._soup)
+             for sibling in reversed(siblings[:current_index]):
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and all(
+                         sibling.get(k) == v for k, v in attrs.items()
+                     ):
+                         return sibling
+         except ValueError:
+             pass
+         return None
+
+     def find_previous_siblings(self, name=None, attrs={}, limit=None, **kwargs) -> List[Tag]:
+         """Find all previous siblings matching given criteria."""
+         if not self._soup.parent:
+             return []
+
+         siblings = []
+         siblings_list = self._soup.parent.contents
+         try:
+             current_index = siblings_list.index(self._soup)
+             for sibling in reversed(siblings_list[:current_index]):
+                 if isinstance(sibling, Tag):
+                     if (name is None or sibling.name == name) and all(
+                         sibling.get(k) == v for k, v in attrs.items()
+                     ):
+                         siblings.append(sibling)
+                         if limit and len(siblings) == limit:
+                             break
+         except ValueError:
+             pass
+         return siblings
+
+     def find_next(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+         """
+         Find the next element in document order.
+
+         Args:
+             name: Tag name to search for
+             attrs: Attributes to match
+             text: Text content to match
+             **kwargs: Additional attributes
+
+         Returns:
+             Optional[Tag]: Next matching element or None
+         """
+         return self._soup.find_next(name, attrs, text, **kwargs)
+
+     def find_all_next(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+         """
+         Find all next elements in document order.
+
+         Args:
+             name: Tag name to search for
+             attrs: Attributes to match
+             text: Text content to match
+             limit: Maximum number of results
+             **kwargs: Additional attributes
+
+         Returns:
+             List[Tag]: List of matching elements
+         """
+         return self._soup.find_all_next(name, attrs, text, limit, **kwargs)
+
+     def find_previous(self, name=None, attrs={}, text=None, **kwargs) -> Optional[Tag]:
+         """
+         Find the previous element in document order.
+
+         Args:
+             name: Tag name to search for
+             attrs: Attributes to match
+             text: Text content to match
+             **kwargs: Additional attributes
+
+         Returns:
+             Optional[Tag]: Previous matching element or None
+         """
+         return self._soup.find_previous(name, attrs, text, **kwargs)
+
+     def find_all_previous(self, name=None, attrs={}, text=None, limit=None, **kwargs) -> List[Tag]:
+         """
+         Find all previous elements in document order.
+
+         Args:
+             name: Tag name to search for
+             attrs: Attributes to match
+             text: Text content to match
+             limit: Maximum number of results
+             **kwargs: Additional attributes
+
+         Returns:
+             List[Tag]: List of matching elements
+         """
+         return self._soup.find_all_previous(name, attrs, text, limit, **kwargs)
+
+     def select(self, selector: str) -> List[Tag]:
+         """
+         Select elements using CSS selector.
+
+         Args:
+             selector (str): CSS selector string
+
+         Returns:
+             List[Tag]: List of matching elements
+         """
+         return self._soup.select(selector)
+
+     def select_one(self, selector: str) -> Optional[Tag]:
+         """
+         Select the first element matching the CSS selector.
+
+         Args:
+             selector (str): CSS selector string
+
+         Returns:
+             Tag or None: First matching element
+         """
+         return self._soup.select_one(selector)
+
+     def get_text(self, separator="", strip=False, types=None) -> str:
+         """
+         Extract all text from the parsed document.
+         Standard behavior like BS4.
+         """
+         return self._soup.get_text(separator, strip, types)
+
+     @property
+     def text(self) -> str:
+         """BS4 compatible text property."""
+         return self.get_text()
+
+     @property
+     def string(self) -> Optional[str]:
+         """BS4 compatible string property."""
+         return self._soup.string
+
+     def get_text_robust(
+         self, separator=" ", strip=False, types=None, encoding_fallbacks=None
+     ) -> str:
+         """Extract text robustly, trying multiple encodings if needed."""
+         try:
+             return self.get_text(separator, strip, types)
+         except UnicodeDecodeError:
+             if encoding_fallbacks:
+                 for enc in encoding_fallbacks:
+                     try:
+                         return self._soup.get_text(separator, strip, types).encode(enc).decode(enc)
+                     except Exception:
+                         continue
+             raise
+
+     def remove_tags(self, tags: List[str]) -> None:
+         """
+         Remove specified tags and their contents from the document.
+
+         Args:
+             tags (List[str]): List of tag names to remove
+         """
+         for tag_name in tags:
+             for tag in self._soup.find_all(tag_name):
+                 tag.decompose()
+
+     def prettify(self, formatter="minimal") -> str:
+         """
+         Return a formatted, pretty-printed version of the HTML.
+
+         Args:
+             formatter (str, optional): Formatting style
+
+         Returns:
+             str: Prettified HTML
+         """
+         return self._soup.prettify(formatter)
+
+     def decompose(self, tag: Optional[Tag] = None) -> None:
+         """
+         Remove a tag and its contents from the document.
+
+         Args:
+             tag (Tag, optional): Tag to remove. If None, removes the root tag.
+         """
+         if tag is None:
+             tag = self._soup
+         assert tag is not None
+         tag.decompose()
+
+     def extract(self, tag: Optional[Tag] = None) -> Tag:
+         """
+         Remove a tag from the document and return it.
+
+         Args:
+             tag (Tag, optional): Tag to extract. If None, extracts the root tag.
+
+         Returns:
+             Tag: Extracted tag
+         """
+         if tag is None:
+             tag = self._soup
+         assert tag is not None
+         return tag.extract()
+
+     def clear(self, tag: Optional[Tag] = None) -> None:
+         """
+         Remove a tag's contents while keeping the tag itself.
+
+         Args:
+             tag (Tag, optional): Tag to clear. If None, clears the root tag.
+         """
+         if tag is None:
+             tag = self._soup
+         assert tag is not None
+         tag.clear()
+
+     def replace_with(self, old_tag: Tag, new_tag: Tag) -> None:
+         """
+         Replace one tag with another.
+
+         Args:
+             old_tag (Tag): Tag to replace
+             new_tag (Tag): Replacement tag
+         """
+         old_tag.replace_with(new_tag)
+
+     def encode(self, encoding="utf-8", errors="strict") -> bytes:
+         """Encode the document to a specific encoding with error handling."""
+         try:
+             return str(self._soup).encode(encoding, errors)
+         except Exception:
+             return str(self._soup).encode("utf-8", errors)
+
+     def decode(self, encoding="utf-8", errors="strict") -> str:
+         """Decode the document from a specific encoding with error handling.
+
+         Note: The parsed soup is represented as a str in memory, so decoding
+         simply returns the string representation.
+         """
+         try:
+             return str(self._soup)
+         except Exception:
+             return str(self._soup)
+
+     def __str__(self) -> str:
+         """
+         String representation of the parsed document.
+
+         Returns:
+             str: HTML content
+         """
+         return str(self._soup)
+
+     def __repr__(self) -> str:
+         """
+         Detailed representation of the Scout object.
+
+         Returns:
+             str: Scout object description
+         """
+         return f"Scout(features='{self.features}', content_length={len(self.markup)})"
+
+     def _preprocess_markup(self, markup: Union[str, bytes], encoding: Optional[str] = None) -> str:
+         """
+         Preprocess markup before parsing.
+
+         Args:
+             markup (str): Input markup
+             encoding (str, optional): Encoding to use
+
+         Returns:
+             str: Preprocessed markup
+         """
+         # Decode markup
+         decoded_markup = decode_markup(markup, encoding)
+
+         # Basic HTML cleaning
+         # Remove comments, normalize whitespace, etc.
+         decoded_markup = re.sub(r"<!--.*?-->", "", decoded_markup, flags=re.DOTALL)
+         decoded_markup = re.sub(r"\s+", " ", decoded_markup)
+
+         return decoded_markup
+
+     def wrap(self, wrapper_tag: Tag) -> Tag:
+         """Wrap the root tag in another tag with error handling."""
+         try:
+             return self._soup.wrap(wrapper_tag)
+         except Exception:
+             return wrapper_tag
+
+     def unwrap(self) -> None:
+         """Unwrap the root tag, keeping its contents in the parent, with error handling."""
+         try:
+             self._soup.unwrap()
+         except Exception:
+             pass
+
+     def insert_before(self, new_element: Tag) -> None:
+         """Insert a tag or string immediately before the root tag with error handling."""
+         try:
+             self._soup.insert_before(new_element)
+         except Exception:
+             pass
+
+     def insert_after(self, new_element: Tag) -> None:
+         """Insert a tag or string immediately after the root tag with error handling."""
+         try:
+             self._soup.insert_after(new_element)
+         except Exception:
+             pass
+
+     def append(self, tag: Tag) -> None:
+         """Append a tag to the root tag with error handling."""
+         try:
+             self._soup.append(tag)
+         except Exception:
+             pass
+
+     @property
+     def descendants(self):
+         """Yield all descendants of the root tag in document order."""
+         return self._soup.descendants
+
+     @property
+     def parents(self):
+         """Yield all parents of the root tag up the tree."""
+         return self._soup.parents
+
+     @property
+     def next_element(self):
+         """Return the next element in document order after the root tag."""
+         return self._soup.next_element
+
+     @property
+     def previous_element(self):
+         """Return the previous element in document order before the root tag."""
+         return self._soup.previous_element
+
+     def fetch_and_parse(self, url: str, session=None, **kwargs) -> "Scout":
+         """Fetch HTML from a URL and parse it with Scout. Prefers curl_cffi."""
+         try:
+             from curl_cffi import requests as curleq
+
+             s = session or curleq.Session()
+             resp = s.get(url, **kwargs)
+             return Scout(resp.content, features=self.features)
+         except ImportError:
+             import requests
+
+             s = session or requests.Session()
+             resp = s.get(url, **kwargs)
+             return Scout(resp.content, features=self.features)
+
+     def tables_to_dataframe(self, table_index=0, pandas_module=None):
+         """Convert the nth table in the document to a pandas DataFrame."""
+         try:
+             if pandas_module:
+                 pd = pandas_module
+             else:
+                 import pandas as pd  # type: ignore
+         except ImportError:
+             raise ImportError("pandas is required for tables_to_dataframe. Please install pandas.")
+         tables = self.find_all("table")
+         if not tables or table_index >= len(tables):
+             return None
+         table = tables[table_index]
+         rows = table.find_all("tr")
+         data = [[cell.get_text(strip=True) for cell in row.find_all(["td", "th"])] for row in rows]
+         return pd.DataFrame(data)
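
To make the API change concrete, here is a minimal usage sketch of the rewritten Scout class (webscout/scout/core/scout.py, item 210 above). It is not part of the diff: the sample HTML is illustrative, and the `from webscout.scout import Scout` import path and `html.parser` feature name are assumptions based on the package layout and the code shown. The main behavioral change from 8.2.9 is that find() now returns a single Tag or None (BeautifulSoup-style) instead of a ScoutSearchResult; find_all() still returns a ScoutSearchResult.

    # Illustrative sketch against webscout 2026.1.19; the HTML sample is made up.
    from webscout.scout import Scout  # import path assumed from the package layout

    html = """
    <html>
      <head>
        <title>Example</title>
        <meta name="description" content="A demo page">
        <meta property="og:title" content="Example OG">
      </head>
      <body>
        <h1>Hello</h1>
        <a href="/docs">Docs</a>
        <table>
          <tr><th>key</th><th>value</th></tr>
          <tr><td>a</td><td>1</td></tr>
        </table>
      </body>
    </html>
    """

    scout = Scout(html, features="html.parser")

    # Breaking change vs 8.2.9: find() returns a single Tag (or None),
    # not a ScoutSearchResult wrapper.
    title = scout.find("title")
    print(title.get_text(strip=True) if title else None)  # expected: Example

    # find_all() still returns a list-like ScoutSearchResult.
    headings = [h.get_text(strip=True) for h in scout.find_all("h1")]

    # Helpers rewritten or added in 2026.1.19.
    meta = scout.extract_metadata()  # title/description/keywords plus og:/twitter: maps
    links = scout.extract_links(base_url="https://example.com")
    df = scout.tables_to_dataframe(0)  # requires pandas; returns None if the index is out of range

Note also that get_text() now defaults to separator="" and no longer runs SentenceTokenizer over the output (the .text_utils import is gone), so callers that relied on the old sentence-per-paragraph text should pass an explicit separator or post-process the result themselves.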