webscout 8.2.9__py3-none-any.whl → 2026.1.19__py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (413)
  1. webscout/AIauto.py +524 -251
  2. webscout/AIbase.py +247 -319
  3. webscout/AIutel.py +68 -703
  4. webscout/Bard.py +1072 -1026
  5. webscout/Extra/GitToolkit/__init__.py +10 -10
  6. webscout/Extra/GitToolkit/gitapi/__init__.py +20 -12
  7. webscout/Extra/GitToolkit/gitapi/gist.py +142 -0
  8. webscout/Extra/GitToolkit/gitapi/organization.py +91 -0
  9. webscout/Extra/GitToolkit/gitapi/repository.py +308 -195
  10. webscout/Extra/GitToolkit/gitapi/search.py +162 -0
  11. webscout/Extra/GitToolkit/gitapi/trending.py +236 -0
  12. webscout/Extra/GitToolkit/gitapi/user.py +128 -96
  13. webscout/Extra/GitToolkit/gitapi/utils.py +82 -62
  14. webscout/Extra/YTToolkit/README.md +443 -375
  15. webscout/Extra/YTToolkit/YTdownloader.py +953 -957
  16. webscout/Extra/YTToolkit/__init__.py +3 -3
  17. webscout/Extra/YTToolkit/transcriber.py +595 -476
  18. webscout/Extra/YTToolkit/ytapi/README.md +230 -44
  19. webscout/Extra/YTToolkit/ytapi/__init__.py +22 -6
  20. webscout/Extra/YTToolkit/ytapi/captions.py +190 -0
  21. webscout/Extra/YTToolkit/ytapi/channel.py +302 -307
  22. webscout/Extra/YTToolkit/ytapi/errors.py +13 -13
  23. webscout/Extra/YTToolkit/ytapi/extras.py +178 -118
  24. webscout/Extra/YTToolkit/ytapi/hashtag.py +120 -0
  25. webscout/Extra/YTToolkit/ytapi/https.py +89 -88
  26. webscout/Extra/YTToolkit/ytapi/patterns.py +61 -61
  27. webscout/Extra/YTToolkit/ytapi/playlist.py +59 -59
  28. webscout/Extra/YTToolkit/ytapi/pool.py +8 -8
  29. webscout/Extra/YTToolkit/ytapi/query.py +143 -40
  30. webscout/Extra/YTToolkit/ytapi/shorts.py +122 -0
  31. webscout/Extra/YTToolkit/ytapi/stream.py +68 -63
  32. webscout/Extra/YTToolkit/ytapi/suggestions.py +97 -0
  33. webscout/Extra/YTToolkit/ytapi/utils.py +66 -62
  34. webscout/Extra/YTToolkit/ytapi/video.py +403 -232
  35. webscout/Extra/__init__.py +2 -3
  36. webscout/Extra/gguf.py +1298 -684
  37. webscout/Extra/tempmail/README.md +487 -487
  38. webscout/Extra/tempmail/__init__.py +28 -28
  39. webscout/Extra/tempmail/async_utils.py +143 -141
  40. webscout/Extra/tempmail/base.py +172 -161
  41. webscout/Extra/tempmail/cli.py +191 -187
  42. webscout/Extra/tempmail/emailnator.py +88 -84
  43. webscout/Extra/tempmail/mail_tm.py +378 -361
  44. webscout/Extra/tempmail/temp_mail_io.py +304 -292
  45. webscout/Extra/weather.py +196 -194
  46. webscout/Extra/weather_ascii.py +17 -15
  47. webscout/Provider/AISEARCH/PERPLEXED_search.py +175 -0
  48. webscout/Provider/AISEARCH/Perplexity.py +292 -333
  49. webscout/Provider/AISEARCH/README.md +106 -279
  50. webscout/Provider/AISEARCH/__init__.py +16 -9
  51. webscout/Provider/AISEARCH/brave_search.py +298 -0
  52. webscout/Provider/AISEARCH/iask_search.py +357 -410
  53. webscout/Provider/AISEARCH/monica_search.py +200 -220
  54. webscout/Provider/AISEARCH/webpilotai_search.py +242 -255
  55. webscout/Provider/Algion.py +413 -0
  56. webscout/Provider/Andi.py +74 -69
  57. webscout/Provider/Apriel.py +313 -0
  58. webscout/Provider/Ayle.py +323 -0
  59. webscout/Provider/ChatSandbox.py +329 -342
  60. webscout/Provider/ClaudeOnline.py +365 -0
  61. webscout/Provider/Cohere.py +232 -208
  62. webscout/Provider/DeepAI.py +367 -0
  63. webscout/Provider/Deepinfra.py +467 -340
  64. webscout/Provider/EssentialAI.py +217 -0
  65. webscout/Provider/ExaAI.py +274 -261
  66. webscout/Provider/Gemini.py +175 -169
  67. webscout/Provider/GithubChat.py +385 -369
  68. webscout/Provider/Gradient.py +286 -0
  69. webscout/Provider/Groq.py +556 -801
  70. webscout/Provider/HadadXYZ.py +323 -0
  71. webscout/Provider/HeckAI.py +392 -375
  72. webscout/Provider/HuggingFace.py +387 -0
  73. webscout/Provider/IBM.py +340 -0
  74. webscout/Provider/Jadve.py +317 -291
  75. webscout/Provider/K2Think.py +306 -0
  76. webscout/Provider/Koboldai.py +221 -384
  77. webscout/Provider/Netwrck.py +273 -270
  78. webscout/Provider/Nvidia.py +310 -0
  79. webscout/Provider/OPENAI/DeepAI.py +489 -0
  80. webscout/Provider/OPENAI/K2Think.py +423 -0
  81. webscout/Provider/OPENAI/PI.py +463 -0
  82. webscout/Provider/OPENAI/README.md +890 -952
  83. webscout/Provider/OPENAI/TogetherAI.py +405 -0
  84. webscout/Provider/OPENAI/TwoAI.py +255 -357
  85. webscout/Provider/OPENAI/__init__.py +148 -40
  86. webscout/Provider/OPENAI/ai4chat.py +348 -293
  87. webscout/Provider/OPENAI/akashgpt.py +436 -0
  88. webscout/Provider/OPENAI/algion.py +303 -0
  89. webscout/Provider/OPENAI/{exachat.py → ayle.py} +365 -444
  90. webscout/Provider/OPENAI/base.py +253 -249
  91. webscout/Provider/OPENAI/cerebras.py +296 -0
  92. webscout/Provider/OPENAI/chatgpt.py +870 -556
  93. webscout/Provider/OPENAI/chatsandbox.py +233 -173
  94. webscout/Provider/OPENAI/deepinfra.py +403 -322
  95. webscout/Provider/OPENAI/e2b.py +2370 -1414
  96. webscout/Provider/OPENAI/elmo.py +278 -0
  97. webscout/Provider/OPENAI/exaai.py +452 -417
  98. webscout/Provider/OPENAI/freeassist.py +446 -0
  99. webscout/Provider/OPENAI/gradient.py +448 -0
  100. webscout/Provider/OPENAI/groq.py +380 -364
  101. webscout/Provider/OPENAI/hadadxyz.py +292 -0
  102. webscout/Provider/OPENAI/heckai.py +333 -308
  103. webscout/Provider/OPENAI/huggingface.py +321 -0
  104. webscout/Provider/OPENAI/ibm.py +425 -0
  105. webscout/Provider/OPENAI/llmchat.py +253 -0
  106. webscout/Provider/OPENAI/llmchatco.py +378 -335
  107. webscout/Provider/OPENAI/meta.py +541 -0
  108. webscout/Provider/OPENAI/netwrck.py +374 -357
  109. webscout/Provider/OPENAI/nvidia.py +317 -0
  110. webscout/Provider/OPENAI/oivscode.py +348 -287
  111. webscout/Provider/OPENAI/openrouter.py +328 -0
  112. webscout/Provider/OPENAI/pydantic_imports.py +1 -172
  113. webscout/Provider/OPENAI/sambanova.py +397 -0
  114. webscout/Provider/OPENAI/sonus.py +305 -304
  115. webscout/Provider/OPENAI/textpollinations.py +370 -339
  116. webscout/Provider/OPENAI/toolbaz.py +375 -413
  117. webscout/Provider/OPENAI/typefully.py +419 -355
  118. webscout/Provider/OPENAI/typliai.py +279 -0
  119. webscout/Provider/OPENAI/utils.py +314 -318
  120. webscout/Provider/OPENAI/wisecat.py +359 -387
  121. webscout/Provider/OPENAI/writecream.py +185 -163
  122. webscout/Provider/OPENAI/x0gpt.py +462 -365
  123. webscout/Provider/OPENAI/zenmux.py +380 -0
  124. webscout/Provider/OpenRouter.py +386 -0
  125. webscout/Provider/Openai.py +337 -496
  126. webscout/Provider/PI.py +443 -429
  127. webscout/Provider/QwenLM.py +346 -254
  128. webscout/Provider/STT/__init__.py +28 -0
  129. webscout/Provider/STT/base.py +303 -0
  130. webscout/Provider/STT/elevenlabs.py +264 -0
  131. webscout/Provider/Sambanova.py +317 -0
  132. webscout/Provider/TTI/README.md +69 -82
  133. webscout/Provider/TTI/__init__.py +37 -7
  134. webscout/Provider/TTI/base.py +147 -64
  135. webscout/Provider/TTI/claudeonline.py +393 -0
  136. webscout/Provider/TTI/magicstudio.py +292 -201
  137. webscout/Provider/TTI/miragic.py +180 -0
  138. webscout/Provider/TTI/pollinations.py +331 -221
  139. webscout/Provider/TTI/together.py +334 -0
  140. webscout/Provider/TTI/utils.py +14 -11
  141. webscout/Provider/TTS/README.md +186 -192
  142. webscout/Provider/TTS/__init__.py +43 -10
  143. webscout/Provider/TTS/base.py +523 -159
  144. webscout/Provider/TTS/deepgram.py +286 -156
  145. webscout/Provider/TTS/elevenlabs.py +189 -111
  146. webscout/Provider/TTS/freetts.py +218 -0
  147. webscout/Provider/TTS/murfai.py +288 -113
  148. webscout/Provider/TTS/openai_fm.py +364 -129
  149. webscout/Provider/TTS/parler.py +203 -111
  150. webscout/Provider/TTS/qwen.py +334 -0
  151. webscout/Provider/TTS/sherpa.py +286 -0
  152. webscout/Provider/TTS/speechma.py +693 -580
  153. webscout/Provider/TTS/streamElements.py +275 -333
  154. webscout/Provider/TTS/utils.py +280 -280
  155. webscout/Provider/TextPollinationsAI.py +331 -308
  156. webscout/Provider/TogetherAI.py +450 -0
  157. webscout/Provider/TwoAI.py +309 -475
  158. webscout/Provider/TypliAI.py +311 -305
  159. webscout/Provider/UNFINISHED/ChatHub.py +219 -209
  160. webscout/Provider/{OPENAI/glider.py → UNFINISHED/ChutesAI.py} +331 -326
  161. webscout/Provider/{GizAI.py → UNFINISHED/GizAI.py} +300 -295
  162. webscout/Provider/{Marcus.py → UNFINISHED/Marcus.py} +218 -198
  163. webscout/Provider/UNFINISHED/Qodo.py +481 -0
  164. webscout/Provider/{MCPCore.py → UNFINISHED/XenAI.py} +330 -315
  165. webscout/Provider/UNFINISHED/Youchat.py +347 -330
  166. webscout/Provider/UNFINISHED/aihumanizer.py +41 -0
  167. webscout/Provider/UNFINISHED/grammerchecker.py +37 -0
  168. webscout/Provider/UNFINISHED/liner.py +342 -0
  169. webscout/Provider/UNFINISHED/liner_api_request.py +246 -263
  170. webscout/Provider/{samurai.py → UNFINISHED/samurai.py} +231 -224
  171. webscout/Provider/WiseCat.py +256 -233
  172. webscout/Provider/WrDoChat.py +390 -370
  173. webscout/Provider/__init__.py +115 -174
  174. webscout/Provider/ai4chat.py +181 -174
  175. webscout/Provider/akashgpt.py +330 -335
  176. webscout/Provider/cerebras.py +397 -290
  177. webscout/Provider/cleeai.py +236 -213
  178. webscout/Provider/elmo.py +291 -283
  179. webscout/Provider/geminiapi.py +343 -208
  180. webscout/Provider/julius.py +245 -223
  181. webscout/Provider/learnfastai.py +333 -325
  182. webscout/Provider/llama3mitril.py +230 -215
  183. webscout/Provider/llmchat.py +308 -258
  184. webscout/Provider/llmchatco.py +321 -306
  185. webscout/Provider/meta.py +996 -801
  186. webscout/Provider/oivscode.py +332 -309
  187. webscout/Provider/searchchat.py +316 -292
  188. webscout/Provider/sonus.py +264 -258
  189. webscout/Provider/toolbaz.py +359 -353
  190. webscout/Provider/turboseek.py +332 -266
  191. webscout/Provider/typefully.py +262 -202
  192. webscout/Provider/x0gpt.py +332 -299
  193. webscout/__init__.py +31 -39
  194. webscout/__main__.py +5 -5
  195. webscout/cli.py +585 -524
  196. webscout/client.py +1497 -70
  197. webscout/conversation.py +140 -436
  198. webscout/exceptions.py +383 -362
  199. webscout/litagent/__init__.py +29 -29
  200. webscout/litagent/agent.py +492 -455
  201. webscout/litagent/constants.py +60 -60
  202. webscout/models.py +505 -181
  203. webscout/optimizers.py +74 -420
  204. webscout/prompt_manager.py +376 -288
  205. webscout/sanitize.py +1514 -0
  206. webscout/scout/README.md +452 -404
  207. webscout/scout/__init__.py +8 -8
  208. webscout/scout/core/__init__.py +7 -7
  209. webscout/scout/core/crawler.py +330 -210
  210. webscout/scout/core/scout.py +800 -607
  211. webscout/scout/core/search_result.py +51 -96
  212. webscout/scout/core/text_analyzer.py +64 -63
  213. webscout/scout/core/text_utils.py +412 -277
  214. webscout/scout/core/web_analyzer.py +54 -52
  215. webscout/scout/element.py +872 -478
  216. webscout/scout/parsers/__init__.py +70 -69
  217. webscout/scout/parsers/html5lib_parser.py +182 -172
  218. webscout/scout/parsers/html_parser.py +238 -236
  219. webscout/scout/parsers/lxml_parser.py +203 -178
  220. webscout/scout/utils.py +38 -37
  221. webscout/search/__init__.py +47 -0
  222. webscout/search/base.py +201 -0
  223. webscout/search/bing_main.py +45 -0
  224. webscout/search/brave_main.py +92 -0
  225. webscout/search/duckduckgo_main.py +57 -0
  226. webscout/search/engines/__init__.py +127 -0
  227. webscout/search/engines/bing/__init__.py +15 -0
  228. webscout/search/engines/bing/base.py +35 -0
  229. webscout/search/engines/bing/images.py +114 -0
  230. webscout/search/engines/bing/news.py +96 -0
  231. webscout/search/engines/bing/suggestions.py +36 -0
  232. webscout/search/engines/bing/text.py +109 -0
  233. webscout/search/engines/brave/__init__.py +19 -0
  234. webscout/search/engines/brave/base.py +47 -0
  235. webscout/search/engines/brave/images.py +213 -0
  236. webscout/search/engines/brave/news.py +353 -0
  237. webscout/search/engines/brave/suggestions.py +318 -0
  238. webscout/search/engines/brave/text.py +167 -0
  239. webscout/search/engines/brave/videos.py +364 -0
  240. webscout/search/engines/duckduckgo/__init__.py +25 -0
  241. webscout/search/engines/duckduckgo/answers.py +80 -0
  242. webscout/search/engines/duckduckgo/base.py +189 -0
  243. webscout/search/engines/duckduckgo/images.py +100 -0
  244. webscout/search/engines/duckduckgo/maps.py +183 -0
  245. webscout/search/engines/duckduckgo/news.py +70 -0
  246. webscout/search/engines/duckduckgo/suggestions.py +22 -0
  247. webscout/search/engines/duckduckgo/text.py +221 -0
  248. webscout/search/engines/duckduckgo/translate.py +48 -0
  249. webscout/search/engines/duckduckgo/videos.py +80 -0
  250. webscout/search/engines/duckduckgo/weather.py +84 -0
  251. webscout/search/engines/mojeek.py +61 -0
  252. webscout/search/engines/wikipedia.py +77 -0
  253. webscout/search/engines/yahoo/__init__.py +41 -0
  254. webscout/search/engines/yahoo/answers.py +19 -0
  255. webscout/search/engines/yahoo/base.py +34 -0
  256. webscout/search/engines/yahoo/images.py +323 -0
  257. webscout/search/engines/yahoo/maps.py +19 -0
  258. webscout/search/engines/yahoo/news.py +258 -0
  259. webscout/search/engines/yahoo/suggestions.py +140 -0
  260. webscout/search/engines/yahoo/text.py +273 -0
  261. webscout/search/engines/yahoo/translate.py +19 -0
  262. webscout/search/engines/yahoo/videos.py +302 -0
  263. webscout/search/engines/yahoo/weather.py +220 -0
  264. webscout/search/engines/yandex.py +67 -0
  265. webscout/search/engines/yep/__init__.py +13 -0
  266. webscout/search/engines/yep/base.py +34 -0
  267. webscout/search/engines/yep/images.py +101 -0
  268. webscout/search/engines/yep/suggestions.py +38 -0
  269. webscout/search/engines/yep/text.py +99 -0
  270. webscout/search/http_client.py +172 -0
  271. webscout/search/results.py +141 -0
  272. webscout/search/yahoo_main.py +57 -0
  273. webscout/search/yep_main.py +48 -0
  274. webscout/server/__init__.py +48 -0
  275. webscout/server/config.py +78 -0
  276. webscout/server/exceptions.py +69 -0
  277. webscout/server/providers.py +286 -0
  278. webscout/server/request_models.py +131 -0
  279. webscout/server/request_processing.py +404 -0
  280. webscout/server/routes.py +642 -0
  281. webscout/server/server.py +351 -0
  282. webscout/server/ui_templates.py +1171 -0
  283. webscout/swiftcli/__init__.py +79 -95
  284. webscout/swiftcli/core/__init__.py +7 -7
  285. webscout/swiftcli/core/cli.py +574 -297
  286. webscout/swiftcli/core/context.py +98 -104
  287. webscout/swiftcli/core/group.py +268 -241
  288. webscout/swiftcli/decorators/__init__.py +28 -28
  289. webscout/swiftcli/decorators/command.py +243 -221
  290. webscout/swiftcli/decorators/options.py +247 -220
  291. webscout/swiftcli/decorators/output.py +392 -252
  292. webscout/swiftcli/exceptions.py +21 -21
  293. webscout/swiftcli/plugins/__init__.py +9 -9
  294. webscout/swiftcli/plugins/base.py +134 -135
  295. webscout/swiftcli/plugins/manager.py +269 -269
  296. webscout/swiftcli/utils/__init__.py +58 -59
  297. webscout/swiftcli/utils/formatting.py +251 -252
  298. webscout/swiftcli/utils/parsing.py +368 -267
  299. webscout/update_checker.py +280 -136
  300. webscout/utils.py +28 -14
  301. webscout/version.py +2 -1
  302. webscout/version.py.bak +3 -0
  303. webscout/zeroart/__init__.py +218 -135
  304. webscout/zeroart/base.py +70 -66
  305. webscout/zeroart/effects.py +155 -101
  306. webscout/zeroart/fonts.py +1799 -1239
  307. webscout-2026.1.19.dist-info/METADATA +638 -0
  308. webscout-2026.1.19.dist-info/RECORD +312 -0
  309. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/WHEEL +1 -1
  310. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/entry_points.txt +1 -1
  311. webscout/DWEBS.py +0 -520
  312. webscout/Extra/Act.md +0 -309
  313. webscout/Extra/GitToolkit/gitapi/README.md +0 -110
  314. webscout/Extra/autocoder/__init__.py +0 -9
  315. webscout/Extra/autocoder/autocoder.py +0 -1105
  316. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  317. webscout/Extra/gguf.md +0 -430
  318. webscout/Extra/weather.md +0 -281
  319. webscout/Litlogger/README.md +0 -10
  320. webscout/Litlogger/__init__.py +0 -15
  321. webscout/Litlogger/formats.py +0 -4
  322. webscout/Litlogger/handlers.py +0 -103
  323. webscout/Litlogger/levels.py +0 -13
  324. webscout/Litlogger/logger.py +0 -92
  325. webscout/Provider/AI21.py +0 -177
  326. webscout/Provider/AISEARCH/DeepFind.py +0 -254
  327. webscout/Provider/AISEARCH/felo_search.py +0 -202
  328. webscout/Provider/AISEARCH/genspark_search.py +0 -324
  329. webscout/Provider/AISEARCH/hika_search.py +0 -186
  330. webscout/Provider/AISEARCH/scira_search.py +0 -298
  331. webscout/Provider/Aitopia.py +0 -316
  332. webscout/Provider/AllenAI.py +0 -440
  333. webscout/Provider/Blackboxai.py +0 -791
  334. webscout/Provider/ChatGPTClone.py +0 -237
  335. webscout/Provider/ChatGPTGratis.py +0 -194
  336. webscout/Provider/Cloudflare.py +0 -324
  337. webscout/Provider/ExaChat.py +0 -358
  338. webscout/Provider/Flowith.py +0 -217
  339. webscout/Provider/FreeGemini.py +0 -250
  340. webscout/Provider/Glider.py +0 -225
  341. webscout/Provider/HF_space/__init__.py +0 -0
  342. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  343. webscout/Provider/HuggingFaceChat.py +0 -469
  344. webscout/Provider/Hunyuan.py +0 -283
  345. webscout/Provider/LambdaChat.py +0 -411
  346. webscout/Provider/Llama3.py +0 -259
  347. webscout/Provider/Nemotron.py +0 -218
  348. webscout/Provider/OLLAMA.py +0 -396
  349. webscout/Provider/OPENAI/BLACKBOXAI.py +0 -766
  350. webscout/Provider/OPENAI/Cloudflare.py +0 -378
  351. webscout/Provider/OPENAI/FreeGemini.py +0 -283
  352. webscout/Provider/OPENAI/NEMOTRON.py +0 -232
  353. webscout/Provider/OPENAI/Qwen3.py +0 -283
  354. webscout/Provider/OPENAI/api.py +0 -969
  355. webscout/Provider/OPENAI/c4ai.py +0 -373
  356. webscout/Provider/OPENAI/chatgptclone.py +0 -494
  357. webscout/Provider/OPENAI/copilot.py +0 -242
  358. webscout/Provider/OPENAI/flowith.py +0 -162
  359. webscout/Provider/OPENAI/freeaichat.py +0 -359
  360. webscout/Provider/OPENAI/mcpcore.py +0 -389
  361. webscout/Provider/OPENAI/multichat.py +0 -376
  362. webscout/Provider/OPENAI/opkfc.py +0 -496
  363. webscout/Provider/OPENAI/scirachat.py +0 -477
  364. webscout/Provider/OPENAI/standardinput.py +0 -433
  365. webscout/Provider/OPENAI/typegpt.py +0 -364
  366. webscout/Provider/OPENAI/uncovrAI.py +0 -463
  367. webscout/Provider/OPENAI/venice.py +0 -431
  368. webscout/Provider/OPENAI/yep.py +0 -382
  369. webscout/Provider/OpenGPT.py +0 -209
  370. webscout/Provider/Perplexitylabs.py +0 -415
  371. webscout/Provider/Reka.py +0 -214
  372. webscout/Provider/StandardInput.py +0 -290
  373. webscout/Provider/TTI/aiarta.py +0 -365
  374. webscout/Provider/TTI/artbit.py +0 -0
  375. webscout/Provider/TTI/fastflux.py +0 -200
  376. webscout/Provider/TTI/piclumen.py +0 -203
  377. webscout/Provider/TTI/pixelmuse.py +0 -225
  378. webscout/Provider/TTS/gesserit.py +0 -128
  379. webscout/Provider/TTS/sthir.py +0 -94
  380. webscout/Provider/TeachAnything.py +0 -229
  381. webscout/Provider/UNFINISHED/puterjs.py +0 -635
  382. webscout/Provider/UNFINISHED/test_lmarena.py +0 -119
  383. webscout/Provider/Venice.py +0 -258
  384. webscout/Provider/VercelAI.py +0 -253
  385. webscout/Provider/Writecream.py +0 -246
  386. webscout/Provider/WritingMate.py +0 -269
  387. webscout/Provider/asksteve.py +0 -220
  388. webscout/Provider/chatglm.py +0 -215
  389. webscout/Provider/copilot.py +0 -425
  390. webscout/Provider/freeaichat.py +0 -285
  391. webscout/Provider/granite.py +0 -235
  392. webscout/Provider/hermes.py +0 -266
  393. webscout/Provider/koala.py +0 -170
  394. webscout/Provider/lmarena.py +0 -198
  395. webscout/Provider/multichat.py +0 -364
  396. webscout/Provider/scira_chat.py +0 -299
  397. webscout/Provider/scnet.py +0 -243
  398. webscout/Provider/talkai.py +0 -194
  399. webscout/Provider/typegpt.py +0 -289
  400. webscout/Provider/uncovr.py +0 -368
  401. webscout/Provider/yep.py +0 -389
  402. webscout/litagent/Readme.md +0 -276
  403. webscout/litprinter/__init__.py +0 -59
  404. webscout/swiftcli/Readme.md +0 -323
  405. webscout/tempid.py +0 -128
  406. webscout/webscout_search.py +0 -1184
  407. webscout/webscout_search_async.py +0 -654
  408. webscout/yep_search.py +0 -347
  409. webscout/zeroart/README.md +0 -89
  410. webscout-8.2.9.dist-info/METADATA +0 -1033
  411. webscout-8.2.9.dist-info/RECORD +0 -289
  412. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/licenses/LICENSE.md +0 -0
  413. {webscout-8.2.9.dist-info → webscout-2026.1.19.dist-info}/top_level.txt +0 -0
webscout/scout/README.md CHANGED
@@ -1,404 +1,452 @@
- # 🕵️ Scout: Next-Gen Web Parsing Library
-
- <div align="center">
-
- [![Python](https://img.shields.io/badge/Python-3.8%2B-blue)](https://www.python.org/)
- [![License](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
- [![Maintenance](https://img.shields.io/badge/Maintained-Yes-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout)
- [![Documentation](https://img.shields.io/badge/Docs-Wiki-orange)](https://github.com/OE-LUCIFER/Webscout/wiki)
- [![PRs Welcome](https://img.shields.io/badge/PRs-Welcome-brightgreen.svg)](https://github.com/OE-LUCIFER/Webscout/pulls)
-
- </div>
-
- ## 📋 Overview
-
- Scout is a powerful, flexible, and performant HTML parsing library that makes web scraping a breeze! It provides intelligent HTML/XML parsing with advanced features like web crawling, text analysis, semantic extraction, and Markdown conversion. Scout goes beyond traditional parsing libraries with its intuitive API and comprehensive feature set.
-
- <details open>
- <summary><b>Why Choose Scout?</b></summary>
-
- - **Powerful Parsing**: Multiple parser backends with intelligent markup handling
- - **Advanced Analysis**: Built-in text and web content analysis tools
- - **Concurrent Crawling**: Efficient multi-threaded web crawling
- - **Flexible API**: Intuitive interface similar to BeautifulSoup but with enhanced capabilities
- - **Format Conversion**: Convert HTML to JSON, Markdown, and more
-
- </details>
-
- ## 📑 Table of Contents
-
- - [Installation](#-installation)
- - [Quick Start](#-quick-start)
- - [Features](#-features)
- - [Advanced Usage](#-advanced-usage)
- - [API Reference](#-api-reference)
- - [Dependencies](#-dependencies)
- - [Supported Python Versions](#-supported-python-versions)
- - [Contributing](#-contributing)
- - [License](#-license)
-
- ## 📦 Installation
-
- ```bash
- pip install webscout
- ```
-
- Or install the latest version from GitHub:
-
- ```bash
- pip install git+https://github.com/OE-LUCIFER/Webscout.git
- ```
-
- ## 🚀 Quick Start
-
- ### Basic Parsing
-
- ```python
- from webscout.scout import Scout
-
- # Parse HTML content
- html_content = """
- <html>
- <body>
- <h1>Hello, Scout!</h1>
- <div class="content">
- <p>Web parsing made easy.</p>
- <a href="https://example.com">Link</a>
- </div>
- </body>
- </html>
- """
-
- scout = Scout(html_content)
-
- # Find elements
- title = scout.find('h1')
- links = scout.find_all('a')
-
- # Extract text
- print(title[0].get_text()) # Output: Hello, Scout!
- print(links.attrs('href')) # Output: ['https://example.com']
- ```
-
- ### Web Crawling
-
- ```python
- from webscout.scout import ScoutCrawler
-
- # Crawl a website with default settings
- crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
-
- # Or customize the crawler
- crawler = ScoutCrawler(
-     'https://example.com', # base_url
-     max_pages=100, # maximum pages to crawl
-     tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
- )
-
- # Start crawling
- crawled_pages = crawler.crawl()
-
- for page in crawled_pages:
-     print(f"URL: {page['url']}")
-     print(f"Title: {page['title']}")
-     print(f"Links found: {len(page['links'])}")
-     print(f"Crawl depth: {page['depth']}")
- ```
-
- ### Text Analysis
-
- ```python
- from webscout.scout import Scout
-
- # Parse a webpage
- html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
- <p>Visit https://climate-action.org for more information.</p></div>"""
- scout = Scout(html)
-
- # Analyze text and extract entities
- analysis = scout.analyze_text()
- print(f"Word frequencies: {analysis['word_count']}")
- print(f"Entities found: {analysis['entities']}")
- ```
-
- ## ✨ Features
-
- ### 🔍 Multiple Parser Support
-
- Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:
-
- | Parser | Description | Best For |
- |--------|-------------|----------|
- | `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
- | `lxml` | Fast C-based parser | Performance-critical applications |
- | `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
- | `lxml-xml` | XML parser | XML document parsing |
-
- ```python
- # Choose your parser
- scout = Scout(html_content, features='lxml') # For speed
- scout = Scout(html_content, features='html5lib') # For compliance
- ```
-
- ### 🌐 Advanced Parsing Capabilities
-
- Scout provides powerful tools for navigating and manipulating HTML/XML documents:
-
- - **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
- - **Tree Traversal**: Navigate parent-child relationships and sibling elements
- - **Content Extraction**: Extract text, attributes, and structured data
- - **Document Manipulation**: Modify, replace, or remove elements
- - **Dynamic Building**: Easily append or insert new nodes
-
- ```python
- # CSS selector support
- elements = scout.select('div.content > p')
-
- # Advanced find with attribute matching
- results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
-
- # Tree traversal
- parent = element.find_parent('div')
- siblings = element.find_next_siblings('p')
- prev_sibling = element.find_previous_sibling('p')
- ```
-
- ### 🧠 Intelligent Analysis
-
- Scout includes built-in analysis tools for extracting insights from web content:
-
- #### Text Analysis
-
- ```python
- # Extract and analyze text
- text = scout.get_text()
- word_counts = scout.text_analyzer.count_words(text)
- entities = scout.text_analyzer.extract_entities(text)
- ```
-
- #### Web Structure Analysis
-
- ```python
- # Analyze page structure
- structure = scout.analyze_page_structure()
- print(f"Most common tags: {structure['tag_distribution']}")
- print(f"Page depth: {max(structure['depth_analysis'].keys())}")
- ```
-
- #### Semantic Information Extraction
-
- ```python
- # Extract semantic information
- semantics = scout.extract_semantic_info()
- print(f"Headings: {semantics['headings']}")
- print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
- print(f"Tables: {semantics['tables']['count']}")
- ```
-
- ### 🕸️ Web Crawling
-
- Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:
-
- ```python
- from webscout.scout import ScoutCrawler
-
- # Create a crawler with default settings
- crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
-
- # Or customize the crawler with specific options
- crawler = ScoutCrawler(
-     'https://example.com', # base_url
-     max_pages=100, # maximum pages to crawl
-     tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
- )
-
- # Start crawling
- pages = crawler.crawl()
-
- # Process results
- for page in pages:
-     print(f"URL: {page['url']}")
-     print(f"Title: {page['title']}")
-     print(f"Links: {len(page['links'])}")
-     print(f"Depth: {page['depth']}")
- ```
-
- The crawler automatically:
- - Stays within the same domain as the base URL
- - Uses concurrent requests for faster crawling
- - Removes unwanted tags (like scripts and styles) for cleaner text extraction
- - Tracks crawl depth for each page
-
- ### 📄 Format Conversion
-
- Scout can convert HTML to various formats:
-
- ```python
- # Convert to JSON
- json_data = scout.to_json(indent=2)
-
- # Convert to Markdown
- markdown = scout.to_markdown(heading_style='ATX')
-
- # Pretty-print HTML
- pretty_html = scout.prettify()
- ```
-
- ## 🔬 Advanced Usage
-
- ### Working with Search Results
-
- Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:
-
- ```python
- from webscout.scout import Scout
-
- scout = Scout(html_content)
-
- # Find all paragraphs
- paragraphs = scout.find_all('p')
-
- # Extract all text from results
- all_text = paragraphs.texts(separator='\n')
-
- # Extract specific attributes
- hrefs = paragraphs.attrs('href')
-
- # Filter results with a predicate function
- important = paragraphs.filter(lambda p: 'important' in p.get('class', []))
-
- # Transform results
- word_counts = paragraphs.map(lambda p: len(p.get_text().split()))
-
- # Analyze text in results
- analysis = paragraphs.analyze_text()
- ```
-
- ### URL Handling and Analysis
-
- ```python
- from webscout.scout import Scout
-
- scout = Scout(html_content)
-
- # Parse and analyze URLs
- links = scout.extract_links(base_url='https://example.com')
- for link in links:
-     url_components = scout.url_parse(link['href'])
-     print(f"Domain: {url_components['netloc']}")
-     print(f"Path: {url_components['path']}")
- ```
-
- ### Metadata Extraction
-
- ```python
- from webscout.scout import Scout
-
- scout = Scout(html_content)
-
- # Extract metadata
- metadata = scout.extract_metadata()
- print(f"Title: {metadata['title']}")
- print(f"Description: {metadata['description']}")
- print(f"Open Graph: {metadata['og_metadata']}")
- print(f"Twitter Card: {metadata['twitter_metadata']}")
- ```
-
- ### Content Hashing and Caching
-
- ```python
- from webscout.scout import Scout
-
- scout = Scout(html_content)
-
- # Generate content hash
- content_hash = scout.hash_content(method='sha256')
-
- # Use caching for expensive operations
- if not scout.cache('parsed_data'):
-     data = scout.extract_semantic_info()
-     scout.cache('parsed_data', data)
-
- cached_data = scout.cache('parsed_data')
- ```
-
- ## 📚 API Reference
-
- ### Core Classes
-
- | Class | Description |
- |-------|-------------|
- | `Scout` | Main class for HTML parsing and traversal |
- | `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
- | `ScoutTextAnalyzer` | Text analysis utilities |
- | `ScoutWebAnalyzer` | Web page analysis utilities |
- | `ScoutSearchResult` | Enhanced search results with filtering and analysis |
- | `Tag` | Represents an HTML/XML tag |
- | `NavigableString` | Represents text within an HTML/XML document |
-
- ### Key Methods
-
- #### Scout Class
-
- - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
- - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
- - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
- - `select(selector)`: Find elements using CSS selector
- - `get_text(separator=' ', strip=False)`: Extract text from document
- - `analyze_text()`: Perform text analysis
- - `analyze_page_structure()`: Analyze document structure
- - `extract_semantic_info()`: Extract semantic information
- - `extract_links(base_url=None)`: Extract all links
- - `extract_metadata()`: Extract metadata from document
- - `to_json(indent=2)`: Convert to JSON
- - `to_markdown(heading_style='ATX')`: Convert to Markdown
- - `prettify(formatter='minimal')`: Pretty-print HTML
-
- #### ScoutCrawler Class
-
- - `__init__(base_url, max_pages=50, tags_to_remove=None)`: Initialize the crawler
- - `crawl()`: Start crawling from the base URL
- - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
- - `_is_valid_url(url)`: Check if a URL is valid (internal method)
-
- For detailed API documentation, please refer to the [documentation](https://github.com/OE-LUCIFER/Webscout/wiki).
-
- ## 🔧 Dependencies
-
- - `curl_cffi`: HTTP library used for web requests
- - `lxml`: XML and HTML processing library (optional, recommended)
- - `html5lib`: Standards-compliant HTML parser (optional)
- - `markdownify`: HTML to Markdown conversion
- - `concurrent.futures`: Asynchronous execution (standard library)
-
- ## 🌈 Supported Python Versions
-
- - Python 3.8+
-
- ## 🤝 Contributing
-
- Contributions are welcome! Here's how you can contribute:
-
- 1. Fork the repository
- 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
- 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
- 4. Push to the branch (`git push origin feature/amazing-feature`)
- 5. Open a Pull Request
-
- Please make sure to update tests as appropriate.
-
- ## 📄 License
-
- This project is licensed under the MIT License - see the LICENSE file for details.
-
- ---
-
- <div align="center">
- <p>Made with ❤️ by the Webscout team</p>
- <p>
- <a href="https://github.com/OE-LUCIFER/Webscout">GitHub</a>
- <a href="https://github.com/OE-LUCIFER/Webscout/wiki">Documentation</a>
- <a href="https://github.com/OE-LUCIFER/Webscout/issues">Report Bug</a>
- <a href="https://github.com/OE-LUCIFER/Webscout/issues">Request Feature</a>
- </p>
- </div>
+ **🚀 The Most Advanced HTML Parser & Web Crawler for AI/LLM Data Collection**
+
+ **🌟 Built for the Future • Powered by Intelligence • Trusted by Developers**
+
+
+ ## 📋 Overview
+
+ Scout is an ultra-powerful, enterprise-grade HTML parsing and web crawling library designed for the AI era. Built with LLM data collection in mind, Scout provides unparalleled capabilities for extracting, analyzing, and processing web content at scale. With its BS4-compatible API enhanced with modern features, Scout is the go-to solution for serious web scraping projects.
+
+ <details open>
+ <summary><b>🌟 Why Scout is the Ultimate Choice</b></summary>
+
+ - **🧠 LLM-Optimized Crawling**: Purpose-built for collecting high-quality training data for Large Language Models
+ - **🌐 Subdomain Intelligence**: Automatically discovers and crawls subdomains (e.g., blog.example.com, docs.example.com)
+ - **⚡ Lightning-Fast Performance**: Multi-threaded concurrent crawling with intelligent rate limiting
+ - **🎯 Surgical Precision**: Advanced content extraction that preserves structure while removing noise
+ - **🔍 Deep Analysis**: Built-in NLP capabilities for entity extraction, text analysis, and semantic understanding
+ - **🛡️ Enterprise-Ready**: Robust error handling, retry mechanisms, and respect for robots.txt
+ - **📊 Rich Data Extraction**: Captures metadata, structured data, semantic content, and more
+ - **🔄 Format Flexibility**: Export to JSON, Markdown, CSV, or custom formats
+ - **🎨 BS4++ API**: Familiar interface with 10x more features
+
+ </details>
+
+ ## 📑 Table of Contents
+
+ - [Installation](#-installation)
+ - [Quick Start](#-quick-start)
+ - [Features](#-features)
+ - [Advanced Usage](#-advanced-usage)
+ - [API Reference](#-api-reference)
+ - [Dependencies](#-dependencies)
+ - [Supported Python Versions](#-supported-python-versions)
+ - [Contributing](#-contributing)
+ - [License](#-license)
+
+ ## 📦 Installation
+
+ ```bash
+ pip install webscout
+ ```
+
+ Or install the latest version from GitHub:
+
+ ```bash
+ pip install git+https://github.com/pyscout/Webscout.git
+ ```
+
+ ## 🚀 Quick Start
+
+ ### Basic Parsing
+
+ ```python
+ from webscout.scout import Scout
+
+ # Parse HTML content
+ html_content = """
+ <html>
+ <body>
+ <h1>Hello, Scout!</h1>
+ <div class="content">
+ <p>Web parsing made easy.</p>
+ <a href="https://example.com">Link</a>
+ </div>
+ </body>
+ </html>
+ """
+
+ scout = Scout(html_content)
+
+ # Find elements
+ title = scout.find('h1')
+ links = scout.find_all('a')
+
+ # Extract text
+ print(title[0].get_text()) # Output: Hello, Scout!
+ print(links.attrs('href')) # Output: ['https://example.com']
+ ```
+
+ ### Web Crawling
+
+ ```python
+ from webscout.scout import ScoutCrawler
+
+ # Crawl a website with default settings
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
+
+ # Or customize the crawler
+ crawler = ScoutCrawler(
+     'https://example.com', # base_url
+     max_pages=100, # maximum pages to crawl
+     tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
+ )
+
+ # Start crawling
+ crawled_pages = crawler.crawl()
+
+ for page in crawled_pages:
+     print(f"URL: {page['url']}")
+     print(f"Title: {page['title']}")
+     print(f"Links found: {len(page['links'])}")
+     print(f"Crawl depth: {page['depth']}")
+ ```
+
+ ### Text Analysis
+
+ ```python
+ from webscout.scout import Scout
+
+ # Parse a webpage
+ html = """<div><h1>Climate Change</h1><p>Email us at info@example.com or call 555-123-4567.</p>
+ <p>Visit https://climate-action.org for more information.</p></div>"""
+ scout = Scout(html)
+
+ # Analyze text and extract entities
+ analysis = scout.analyze_text()
+ print(f"Word frequencies: {analysis['word_count']}")
+ print(f"Entities found: {analysis['entities']}")
+ ```
+
+ ## Features
+
+ ### 🔍 Multiple Parser Support
+
+ Scout supports multiple HTML/XML parsers, allowing you to choose the best tool for your specific needs:
+
+ | Parser | Description | Best For |
+ |--------|-------------|----------|
+ | `html.parser` | Python's built-in parser | General-purpose parsing, no dependencies |
+ | `lxml` | Fast C-based parser | Performance-critical applications |
+ | `html5lib` | Highly compliant HTML5 parser | Handling malformed HTML |
+ | `lxml-xml` | XML parser | XML document parsing |
+
+ ```python
+ # Choose your parser
+ scout = Scout(html_content, features='lxml') # For speed
+ scout = Scout(html_content, features='html5lib') # For compliance
+ ```
+
+ ### 🌐 Advanced Parsing Capabilities
+
+ Scout provides powerful tools for navigating and manipulating HTML/XML documents:
+
+ - **Element Selection**: Find elements by tag name, attributes, CSS selectors, and more
+ - **Tree Traversal**: Navigate parent-child relationships and sibling elements
+ - **Content Extraction**: Extract text, attributes, and structured data
+ - **Document Manipulation**: Modify, replace, or remove elements
+ - **Dynamic Building**: Easily append or insert new nodes
+
+ #### CSS Selector Support
+
+ Scout includes a comprehensive CSS selector engine that supports all common selector types:
+
+ ```python
+ # Tag selectors
+ paragraphs = scout.select('p')
+ divs = scout.select('div')
+
+ # Class selectors
+ items = scout.select('.item') # Single class
+ cards = scout.select('div.card') # Tag + class
+ special = scout.select('.card.special') # Multiple classes
+
+ # ID selectors
+ header = scout.select_one('#header') # Single element by ID
+ menu = scout.select('nav#main-menu') # Tag + ID
+
+ # Attribute selectors
+ links = scout.select('a[href]') # Has attribute
+ external = scout.select('a[rel="nofollow"]') # Attribute value
+ images = scout.select('img[alt]') # Has alt attribute
+
+ # Descendant selectors (space)
+ nested = scout.select('div p') # Any p inside div
+ deep = scout.select('article section p') # Deeply nested
+
+ # Child selectors (>)
+ direct = scout.select('ul > li') # Direct children only
+ menu_items = scout.select('nav#menu > ul > li') # Multiple levels
+
+ # Combined selectors
+ complex = scout.select('div.container > p.text[lang="en"]')
+ links = scout.select('ol#results > li.item a[href]')
+
+ # Get first match only
+ first = scout.select_one('p.intro')
+ ```
+
+ **Supported Selector Types:**
+ - **Tag**: `p`, `div`, `a`
+ - **Class**: `.class`, `div.class`, `.class1.class2`
+ - **ID**: `#id`, `div#id`
+ - **Attribute**: `[attr]`, `[attr="value"]`
+ - **Descendant**: `div p`, `article section p`
+ - **Child**: `div > p`, `ul > li`
+ - **Combined**: `p.class#id[attr="value"]`
+
+ #### Element Navigation
+
+ ```python
+ # Advanced find with attribute matching
+ results = scout.find_all('a', attrs={'class': 'external', 'rel': 'nofollow'})
+
+ # Tree traversal
+ parent = element.find_parent('div')
+ siblings = element.find_next_siblings('p')
+ prev_sibling = element.find_previous_sibling('p')
+ ```
+
+ ### 🧠 Intelligent Analysis
+
+ Scout includes built-in analysis tools for extracting insights from web content:
+
+ #### Text Analysis
+
+ ```python
+ # Extract and analyze text
+ text = scout.get_text()
+ word_counts = scout.text_analyzer.count_words(text)
+ entities = scout.text_analyzer.extract_entities(text)
+ ```
+
+ #### Web Structure Analysis
+
+ ```python
+ # Analyze page structure
+ structure = scout.analyze_page_structure()
+ print(f"Most common tags: {structure['tag_distribution']}")
+ print(f"Page depth: {max(structure['depth_analysis'].keys())}")
+ ```
+
+ #### Semantic Information Extraction
+
+ ```python
+ # Extract semantic information
+ semantics = scout.extract_semantic_info()
+ print(f"Headings: {semantics['headings']}")
+ print(f"Lists: {len(semantics['lists']['ul']) + len(semantics['lists']['ol'])}")
+ print(f"Tables: {semantics['tables']['count']}")
+ ```
+
+ ### 🕸️ Web Crawling
+
+ Scout includes a powerful concurrent web crawler for fetching and analyzing multiple pages:
+
+ ```python
+ from webscout.scout import ScoutCrawler
+
+ # Create a crawler with default settings
+ crawler = ScoutCrawler('https://example.com') # Default: max_pages=50
+
+ # Or customize the crawler with specific options
+ crawler = ScoutCrawler(
+     'https://example.com', # base_url
+     max_pages=100, # maximum pages to crawl
+     tags_to_remove=['script', 'style', 'nav'] # tags to remove from content
+ )
+
+ # Start crawling
+ pages = crawler.crawl()
+
+ # Process results
+ for page in pages:
+     print(f"URL: {page['url']}")
+     print(f"Title: {page['title']}")
+     print(f"Links: {len(page['links'])}")
+     print(f"Depth: {page['depth']}")
+ ```
+
+ The crawler automatically:
+ - Stays within the same domain as the base URL
+ - Uses concurrent requests for faster crawling
+ - Removes unwanted tags (like scripts and styles) for cleaner text extraction
+ - Tracks crawl depth for each page
+
+ ### 📄 Format Conversion
+
+ Scout can convert HTML to various formats:
+
+ ```python
+ # Convert to JSON
+ json_data = scout.to_json(indent=2)
+
+ # Convert to Markdown
+ markdown = scout.to_markdown(heading_style='ATX')
+
+ # Pretty-print HTML
+ pretty_html = scout.prettify()
+ ```
+
+ ## 🔬 Advanced Usage
+
+ ### Working with Search Results
+
+ Scout's search methods return a `ScoutSearchResult` object with powerful methods for processing results:
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Find all paragraphs
+ paragraphs = scout.find_all('p')
+
+ # Extract all text from results
+ all_text = paragraphs.texts(separator='\n')
+
+ # Extract specific attributes
+ hrefs = paragraphs.attrs('href')
+
+ # Filter results with a predicate function
+ important = paragraphs.filter(lambda p: 'important' in p.get('class', []))
+
+ # Transform results
+ word_counts = paragraphs.map(lambda p: len(p.get_text().split()))
+
+ # Analyze text in results
+ analysis = paragraphs.analyze_text()
+ ```
+
+ ### URL Handling and Analysis
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Parse and analyze URLs
+ links = scout.extract_links(base_url='https://example.com')
+ for link in links:
+     url_components = scout.url_parse(link['href'])
+     print(f"Domain: {url_components['netloc']}")
+     print(f"Path: {url_components['path']}")
+ ```
+
+ ### Metadata Extraction
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Extract metadata
+ metadata = scout.extract_metadata()
+ print(f"Title: {metadata['title']}")
+ print(f"Description: {metadata['description']}")
+ print(f"Open Graph: {metadata['og_metadata']}")
+ print(f"Twitter Card: {metadata['twitter_metadata']}")
+ ```
+
+ ### Content Hashing and Caching
+
+ ```python
+ from webscout.scout import Scout
+
+ scout = Scout(html_content)
+
+ # Generate content hash
+ content_hash = scout.hash_content(method='sha256')
+
+ # Use caching for expensive operations
+ if not scout.cache('parsed_data'):
+     data = scout.extract_semantic_info()
+     scout.cache('parsed_data', data)
+
+ cached_data = scout.cache('parsed_data')
+ ```
+
+ ## 📚 API Reference
+
+ ### Core Classes
+
+ | Class | Description |
+ |-------|-------------|
+ | `Scout` | Main class for HTML parsing and traversal |
+ | `ScoutCrawler` | Web crawler for fetching and parsing multiple pages |
+ | `ScoutTextAnalyzer` | Text analysis utilities |
+ | `ScoutWebAnalyzer` | Web page analysis utilities |
+ | `ScoutSearchResult` | Enhanced search results with filtering and analysis |
+ | `Tag` | Represents an HTML/XML tag |
+ | `NavigableString` | Represents text within an HTML/XML document |
+
+ ### Key Methods
+
+ #### Scout Class
+
+ - `__init__(markup, features='html.parser', from_encoding=None)`: Initialize with HTML content
+ - `find(name, attrs={}, recursive=True, text=None)`: Find first matching element
+ - `find_all(name, attrs={}, recursive=True, text=None, limit=None)`: Find all matching elements
+ - `find_next(name, attrs={}, text=None)`: Find next element in document order
+ - `find_all_next(name, attrs={}, text=None, limit=None)`: Find all next elements in document order
+ - `find_previous(name, attrs={}, text=None)`: Find previous element in document order
+ - `find_all_previous(name, attrs={}, text=None, limit=None)`: Find all previous elements in document order
+ - `select(selector)`: Find elements using CSS selector
+ - `get_text(separator=' ', strip=False)`: Extract text from document
+ - `analyze_text()`: Perform text analysis
+ - `analyze_page_structure()`: Analyze document structure
+ - `extract_semantic_info()`: Extract semantic information
+ - `extract_links(base_url=None)`: Extract all links
+ - `extract_metadata()`: Extract metadata from document
+ - `to_json(indent=2)`: Convert to JSON
+ - `to_markdown(heading_style='ATX')`: Convert to Markdown
+ - `prettify(formatter='minimal')`: Pretty-print HTML
+
+ #### ScoutCrawler Class
+
+ - `__init__(base_url, max_pages=50, tags_to_remove=None)`: Initialize the crawler
+ - `crawl()`: Start crawling from the base URL
+ - `_crawl_page(url, depth=0)`: Crawl a single page (internal method)
+ - `_is_valid_url(url)`: Check if a URL is valid (internal method)
+
+ For detailed API documentation, please refer to the [documentation](https://github.com/pyscout/Webscout/wiki).
+
+ ## 🔧 Dependencies
+
+ - `curl_cffi`: HTTP library used for web requests
+ - `lxml`: XML and HTML processing library (optional, recommended)
+ - `html5lib`: Standards-compliant HTML parser (optional)
+ - `markdownify`: HTML to Markdown conversion
+ - `concurrent.futures`: Asynchronous execution (standard library)
+
+ ## 🌈 Supported Python Versions
+
+ - Python 3.8+
+
+ ## 🤝 Contributing
+
+ Contributions are welcome! Here's how you can contribute:
+
+ 1. Fork the repository
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
+ 3. Commit your changes (`git commit -m 'Add some amazing feature'`)
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
+ 5. Open a Pull Request
+
+ Please make sure to update tests as appropriate.
+
+ ## 📄 License
+
+ This project is licensed under the MIT License - see the LICENSE file for details.
+
+ ---
+
+ <div align="center">
+ <p>Made with ❤️ by the Webscout team</p>
+ <p>
+ <a href="https://github.com/pyscout/Webscout">GitHub</a> •
+ <a href="https://github.com/pyscout/Webscout/wiki">Documentation</a> •
+ <a href="https://github.com/pyscout/Webscout/issues">Report Bug</a> •
+ <a href="https://github.com/pyscout/Webscout/issues">Request Feature</a>
+ </p>
+ </div>
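
The updated README's API reference adds document-order navigation methods to `Scout` (`find_next`, `find_all_next`, `find_previous`, `find_all_previous`) but does not show them in use. Below is a minimal, hypothetical sketch of how they might be called, assuming only the signatures listed in the diff above; it is illustrative and not taken from the package.

```python
from webscout.scout import Scout  # assumes webscout is installed

html = """
<article>
  <h2>Intro</h2>
  <p>First paragraph.</p>
  <p>Second paragraph.</p>
  <h2>Details</h2>
  <p>Third paragraph.</p>
</article>
"""

scout = Scout(html)

# Document-order navigation, following the signatures listed in the updated
# API reference. Exact semantics and return types are assumptions here.
next_p = scout.find_next('p')                  # next <p> in document order
following = scout.find_all_next('p', limit=2)  # up to two following <p> elements
prev_h2 = scout.find_previous('h2')            # nearest preceding <h2>
preceding = scout.find_all_previous('p')       # preceding <p> elements

print(next_p, following, prev_h2, preceding)
```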