webscout 8.2.6__py3-none-any.whl → 8.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (292) hide show
  1. webscout/AIutel.py +97 -87
  2. webscout/version.py +1 -1
  3. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/METADATA +2 -15
  4. webscout-8.2.7.dist-info/RECORD +26 -0
  5. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/WHEEL +1 -1
  6. webscout-8.2.7.dist-info/entry_points.txt +3 -0
  7. webscout-8.2.7.dist-info/top_level.txt +1 -0
  8. webscout/Extra/GitToolkit/__init__.py +0 -10
  9. webscout/Extra/GitToolkit/gitapi/__init__.py +0 -12
  10. webscout/Extra/GitToolkit/gitapi/repository.py +0 -195
  11. webscout/Extra/GitToolkit/gitapi/user.py +0 -96
  12. webscout/Extra/GitToolkit/gitapi/utils.py +0 -62
  13. webscout/Extra/YTToolkit/YTdownloader.py +0 -957
  14. webscout/Extra/YTToolkit/__init__.py +0 -3
  15. webscout/Extra/YTToolkit/transcriber.py +0 -476
  16. webscout/Extra/YTToolkit/ytapi/__init__.py +0 -6
  17. webscout/Extra/YTToolkit/ytapi/channel.py +0 -307
  18. webscout/Extra/YTToolkit/ytapi/errors.py +0 -13
  19. webscout/Extra/YTToolkit/ytapi/extras.py +0 -45
  20. webscout/Extra/YTToolkit/ytapi/https.py +0 -88
  21. webscout/Extra/YTToolkit/ytapi/patterns.py +0 -61
  22. webscout/Extra/YTToolkit/ytapi/playlist.py +0 -59
  23. webscout/Extra/YTToolkit/ytapi/pool.py +0 -8
  24. webscout/Extra/YTToolkit/ytapi/query.py +0 -40
  25. webscout/Extra/YTToolkit/ytapi/stream.py +0 -63
  26. webscout/Extra/YTToolkit/ytapi/utils.py +0 -62
  27. webscout/Extra/YTToolkit/ytapi/video.py +0 -232
  28. webscout/Extra/__init__.py +0 -7
  29. webscout/Extra/autocoder/__init__.py +0 -9
  30. webscout/Extra/autocoder/autocoder.py +0 -910
  31. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  32. webscout/Extra/gguf.py +0 -684
  33. webscout/Extra/tempmail/__init__.py +0 -28
  34. webscout/Extra/tempmail/async_utils.py +0 -141
  35. webscout/Extra/tempmail/base.py +0 -161
  36. webscout/Extra/tempmail/cli.py +0 -187
  37. webscout/Extra/tempmail/emailnator.py +0 -84
  38. webscout/Extra/tempmail/mail_tm.py +0 -361
  39. webscout/Extra/tempmail/temp_mail_io.py +0 -292
  40. webscout/Extra/weather.py +0 -194
  41. webscout/Extra/weather_ascii.py +0 -76
  42. webscout/Litlogger/__init__.py +0 -67
  43. webscout/Litlogger/core/__init__.py +0 -6
  44. webscout/Litlogger/core/level.py +0 -23
  45. webscout/Litlogger/core/logger.py +0 -165
  46. webscout/Litlogger/handlers/__init__.py +0 -12
  47. webscout/Litlogger/handlers/console.py +0 -33
  48. webscout/Litlogger/handlers/file.py +0 -143
  49. webscout/Litlogger/handlers/network.py +0 -173
  50. webscout/Litlogger/styles/__init__.py +0 -7
  51. webscout/Litlogger/styles/colors.py +0 -249
  52. webscout/Litlogger/styles/formats.py +0 -458
  53. webscout/Litlogger/styles/text.py +0 -87
  54. webscout/Litlogger/utils/__init__.py +0 -6
  55. webscout/Litlogger/utils/detectors.py +0 -153
  56. webscout/Litlogger/utils/formatters.py +0 -200
  57. webscout/Provider/AI21.py +0 -177
  58. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  59. webscout/Provider/AISEARCH/ISou.py +0 -256
  60. webscout/Provider/AISEARCH/Perplexity.py +0 -359
  61. webscout/Provider/AISEARCH/__init__.py +0 -10
  62. webscout/Provider/AISEARCH/felo_search.py +0 -228
  63. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  64. webscout/Provider/AISEARCH/hika_search.py +0 -198
  65. webscout/Provider/AISEARCH/iask_search.py +0 -436
  66. webscout/Provider/AISEARCH/monica_search.py +0 -246
  67. webscout/Provider/AISEARCH/scira_search.py +0 -322
  68. webscout/Provider/AISEARCH/webpilotai_search.py +0 -281
  69. webscout/Provider/Aitopia.py +0 -316
  70. webscout/Provider/AllenAI.py +0 -447
  71. webscout/Provider/Andi.py +0 -228
  72. webscout/Provider/Blackboxai.py +0 -229
  73. webscout/Provider/ChatGPTClone.py +0 -237
  74. webscout/Provider/ChatGPTGratis.py +0 -194
  75. webscout/Provider/ChatSandbox.py +0 -342
  76. webscout/Provider/Cloudflare.py +0 -325
  77. webscout/Provider/Cohere.py +0 -208
  78. webscout/Provider/Deepinfra.py +0 -338
  79. webscout/Provider/ElectronHub.py +0 -773
  80. webscout/Provider/ExaAI.py +0 -261
  81. webscout/Provider/ExaChat.py +0 -358
  82. webscout/Provider/Free2GPT.py +0 -241
  83. webscout/Provider/GPTWeb.py +0 -249
  84. webscout/Provider/Gemini.py +0 -169
  85. webscout/Provider/GithubChat.py +0 -370
  86. webscout/Provider/GizAI.py +0 -285
  87. webscout/Provider/Glider.py +0 -222
  88. webscout/Provider/Groq.py +0 -801
  89. webscout/Provider/HF_space/__init__.py +0 -0
  90. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  91. webscout/Provider/HeckAI.py +0 -257
  92. webscout/Provider/HuggingFaceChat.py +0 -469
  93. webscout/Provider/Hunyuan.py +0 -283
  94. webscout/Provider/Jadve.py +0 -291
  95. webscout/Provider/Koboldai.py +0 -381
  96. webscout/Provider/LambdaChat.py +0 -411
  97. webscout/Provider/Llama3.py +0 -259
  98. webscout/Provider/MCPCore.py +0 -315
  99. webscout/Provider/Marcus.py +0 -206
  100. webscout/Provider/Nemotron.py +0 -218
  101. webscout/Provider/Netwrck.py +0 -270
  102. webscout/Provider/OLLAMA.py +0 -396
  103. webscout/Provider/OPENAI/__init__.py +0 -28
  104. webscout/Provider/OPENAI/ai4chat.py +0 -286
  105. webscout/Provider/OPENAI/base.py +0 -46
  106. webscout/Provider/OPENAI/c4ai.py +0 -367
  107. webscout/Provider/OPENAI/chatgpt.py +0 -549
  108. webscout/Provider/OPENAI/chatgptclone.py +0 -481
  109. webscout/Provider/OPENAI/deepinfra.py +0 -309
  110. webscout/Provider/OPENAI/e2b.py +0 -1350
  111. webscout/Provider/OPENAI/exaai.py +0 -404
  112. webscout/Provider/OPENAI/exachat.py +0 -437
  113. webscout/Provider/OPENAI/freeaichat.py +0 -352
  114. webscout/Provider/OPENAI/glider.py +0 -316
  115. webscout/Provider/OPENAI/groq.py +0 -354
  116. webscout/Provider/OPENAI/heckai.py +0 -341
  117. webscout/Provider/OPENAI/llmchatco.py +0 -327
  118. webscout/Provider/OPENAI/mcpcore.py +0 -376
  119. webscout/Provider/OPENAI/multichat.py +0 -368
  120. webscout/Provider/OPENAI/netwrck.py +0 -350
  121. webscout/Provider/OPENAI/opkfc.py +0 -488
  122. webscout/Provider/OPENAI/scirachat.py +0 -462
  123. webscout/Provider/OPENAI/sonus.py +0 -294
  124. webscout/Provider/OPENAI/standardinput.py +0 -425
  125. webscout/Provider/OPENAI/textpollinations.py +0 -329
  126. webscout/Provider/OPENAI/toolbaz.py +0 -406
  127. webscout/Provider/OPENAI/typegpt.py +0 -346
  128. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  129. webscout/Provider/OPENAI/utils.py +0 -211
  130. webscout/Provider/OPENAI/venice.py +0 -413
  131. webscout/Provider/OPENAI/wisecat.py +0 -381
  132. webscout/Provider/OPENAI/writecream.py +0 -156
  133. webscout/Provider/OPENAI/x0gpt.py +0 -371
  134. webscout/Provider/OPENAI/yep.py +0 -327
  135. webscout/Provider/OpenGPT.py +0 -209
  136. webscout/Provider/Openai.py +0 -496
  137. webscout/Provider/PI.py +0 -429
  138. webscout/Provider/Perplexitylabs.py +0 -415
  139. webscout/Provider/QwenLM.py +0 -254
  140. webscout/Provider/Reka.py +0 -214
  141. webscout/Provider/StandardInput.py +0 -290
  142. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  143. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  144. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  145. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  146. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  147. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  148. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  149. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  150. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  151. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  152. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  153. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  154. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  155. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  156. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  157. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  158. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  159. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  160. webscout/Provider/TTI/__init__.py +0 -12
  161. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  162. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  163. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  164. webscout/Provider/TTI/artbit/__init__.py +0 -22
  165. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  166. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  167. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  168. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  169. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  170. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  171. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  172. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  173. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  174. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  175. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  176. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  177. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  178. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  179. webscout/Provider/TTI/talkai/__init__.py +0 -4
  180. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  181. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  182. webscout/Provider/TTS/__init__.py +0 -8
  183. webscout/Provider/TTS/base.py +0 -159
  184. webscout/Provider/TTS/deepgram.py +0 -156
  185. webscout/Provider/TTS/elevenlabs.py +0 -111
  186. webscout/Provider/TTS/gesserit.py +0 -128
  187. webscout/Provider/TTS/murfai.py +0 -113
  188. webscout/Provider/TTS/parler.py +0 -111
  189. webscout/Provider/TTS/speechma.py +0 -180
  190. webscout/Provider/TTS/streamElements.py +0 -333
  191. webscout/Provider/TTS/utils.py +0 -280
  192. webscout/Provider/TeachAnything.py +0 -233
  193. webscout/Provider/TextPollinationsAI.py +0 -306
  194. webscout/Provider/TwoAI.py +0 -280
  195. webscout/Provider/TypliAI.py +0 -305
  196. webscout/Provider/Venice.py +0 -258
  197. webscout/Provider/VercelAI.py +0 -253
  198. webscout/Provider/WiseCat.py +0 -233
  199. webscout/Provider/WrDoChat.py +0 -370
  200. webscout/Provider/Writecream.py +0 -237
  201. webscout/Provider/WritingMate.py +0 -269
  202. webscout/Provider/Youchat.py +0 -330
  203. webscout/Provider/__init__.py +0 -178
  204. webscout/Provider/ai4chat.py +0 -203
  205. webscout/Provider/aimathgpt.py +0 -189
  206. webscout/Provider/akashgpt.py +0 -335
  207. webscout/Provider/asksteve.py +0 -212
  208. webscout/Provider/bagoodex.py +0 -145
  209. webscout/Provider/cerebras.py +0 -288
  210. webscout/Provider/chatglm.py +0 -215
  211. webscout/Provider/cleeai.py +0 -213
  212. webscout/Provider/copilot.py +0 -425
  213. webscout/Provider/elmo.py +0 -283
  214. webscout/Provider/freeaichat.py +0 -285
  215. webscout/Provider/geminiapi.py +0 -208
  216. webscout/Provider/geminiprorealtime.py +0 -160
  217. webscout/Provider/granite.py +0 -235
  218. webscout/Provider/hermes.py +0 -266
  219. webscout/Provider/julius.py +0 -223
  220. webscout/Provider/koala.py +0 -268
  221. webscout/Provider/learnfastai.py +0 -325
  222. webscout/Provider/llama3mitril.py +0 -215
  223. webscout/Provider/llmchat.py +0 -255
  224. webscout/Provider/llmchatco.py +0 -306
  225. webscout/Provider/meta.py +0 -798
  226. webscout/Provider/multichat.py +0 -364
  227. webscout/Provider/scira_chat.py +0 -297
  228. webscout/Provider/scnet.py +0 -243
  229. webscout/Provider/searchchat.py +0 -292
  230. webscout/Provider/sonus.py +0 -258
  231. webscout/Provider/talkai.py +0 -194
  232. webscout/Provider/toolbaz.py +0 -353
  233. webscout/Provider/turboseek.py +0 -266
  234. webscout/Provider/typefully.py +0 -330
  235. webscout/Provider/typegpt.py +0 -289
  236. webscout/Provider/uncovr.py +0 -368
  237. webscout/Provider/x0gpt.py +0 -299
  238. webscout/Provider/yep.py +0 -389
  239. webscout/litagent/__init__.py +0 -29
  240. webscout/litagent/agent.py +0 -455
  241. webscout/litagent/constants.py +0 -60
  242. webscout/litprinter/__init__.py +0 -59
  243. webscout/scout/__init__.py +0 -8
  244. webscout/scout/core/__init__.py +0 -7
  245. webscout/scout/core/crawler.py +0 -140
  246. webscout/scout/core/scout.py +0 -568
  247. webscout/scout/core/search_result.py +0 -96
  248. webscout/scout/core/text_analyzer.py +0 -63
  249. webscout/scout/core/text_utils.py +0 -277
  250. webscout/scout/core/web_analyzer.py +0 -52
  251. webscout/scout/core.py +0 -881
  252. webscout/scout/element.py +0 -460
  253. webscout/scout/parsers/__init__.py +0 -69
  254. webscout/scout/parsers/html5lib_parser.py +0 -172
  255. webscout/scout/parsers/html_parser.py +0 -236
  256. webscout/scout/parsers/lxml_parser.py +0 -178
  257. webscout/scout/utils.py +0 -37
  258. webscout/swiftcli/__init__.py +0 -95
  259. webscout/swiftcli/core/__init__.py +0 -7
  260. webscout/swiftcli/core/cli.py +0 -297
  261. webscout/swiftcli/core/context.py +0 -104
  262. webscout/swiftcli/core/group.py +0 -241
  263. webscout/swiftcli/decorators/__init__.py +0 -28
  264. webscout/swiftcli/decorators/command.py +0 -221
  265. webscout/swiftcli/decorators/options.py +0 -220
  266. webscout/swiftcli/decorators/output.py +0 -252
  267. webscout/swiftcli/exceptions.py +0 -21
  268. webscout/swiftcli/plugins/__init__.py +0 -9
  269. webscout/swiftcli/plugins/base.py +0 -135
  270. webscout/swiftcli/plugins/manager.py +0 -262
  271. webscout/swiftcli/utils/__init__.py +0 -59
  272. webscout/swiftcli/utils/formatting.py +0 -252
  273. webscout/swiftcli/utils/parsing.py +0 -267
  274. webscout/zeroart/__init__.py +0 -55
  275. webscout/zeroart/base.py +0 -60
  276. webscout/zeroart/effects.py +0 -99
  277. webscout/zeroart/fonts.py +0 -816
  278. webscout-8.2.6.dist-info/RECORD +0 -307
  279. webscout-8.2.6.dist-info/entry_points.txt +0 -3
  280. webscout-8.2.6.dist-info/top_level.txt +0 -2
  281. webstoken/__init__.py +0 -30
  282. webstoken/classifier.py +0 -189
  283. webstoken/keywords.py +0 -216
  284. webstoken/language.py +0 -128
  285. webstoken/ner.py +0 -164
  286. webstoken/normalizer.py +0 -35
  287. webstoken/processor.py +0 -77
  288. webstoken/sentiment.py +0 -206
  289. webstoken/stemmer.py +0 -73
  290. webstoken/tagger.py +0 -60
  291. webstoken/tokenizer.py +0 -158
  292. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/licenses/LICENSE.md +0 -0
webscout/scout/element.py DELETED
@@ -1,460 +0,0 @@
1
- """
2
- Scout Element Module - Advanced HTML Element Representation
3
- """
4
-
5
- import re
6
- from typing import Optional, List, Dict, Union, Any
7
-
8
class NavigableString(str):
    """
    Text node that remembers which element it belongs to.

    Behaves like a regular ``str`` but carries a ``parent`` attribute, in the
    spirit of BeautifulSoup's NavigableString.
    """

    def __new__(cls, text: str):
        """
        Allocate the underlying immutable string value.

        Args:
            text (str): String content
        """
        instance = str.__new__(cls, text)
        return instance

    def __init__(self, text: str):
        """
        Set up tree bookkeeping; the text itself is handled by ``__new__``.

        Args:
            text (str): String content
        """
        # A fresh string is detached until something assigns its parent.
        self.parent = None

    def __repr__(self):
        """Debug representation wrapping the plain-string repr."""
        plain_repr = str.__repr__(self)
        return f"NavigableString({plain_repr})"

    def __add__(self, other):
        """
        Concatenate with any object, using its ``str()`` form.

        Args:
            other (str): String to concatenate

        Returns:
            str: Concatenated plain string (not a NavigableString)
        """
        left = str(self)
        right = str(other)
        return left + right

    def strip(self, chars=None):
        """
        Strip whitespace (or the given characters) from both ends.

        Args:
            chars (str, optional): Characters to strip

        Returns:
            NavigableString: New, parent-less string with the ends stripped
        """
        trimmed = str.strip(self, chars)
        return NavigableString(trimmed)
58
-
59
class Tag:
    """
    Represents an HTML tag with traversal and manipulation helpers.

    The API follows BeautifulSoup's ``Tag`` where practical. ``contents``
    holds child ``Tag`` objects and ``NavigableString`` text nodes, and
    ``parent`` points at the enclosing tag (``None`` when detached).
    """

    # Matches "tag[attr]", "tag[attr=value]" and the tag-less "[attr]" forms.
    _ATTR_SELECTOR_RE = re.compile(r'([\w-]*)\[([^=\]]+)(?:=(.+))?\]')

    def __init__(self, name: str, attrs: Dict[str, str] = None):
        """
        Initialize a Tag with name and attributes.

        Args:
            name (str): Tag name
            attrs (dict, optional): Tag attributes
        """
        self.name = name
        self.attrs = attrs or {}
        self.contents = []
        self.parent = None
        self.string = None  # For single string content

    def __str__(self):
        """Return the concatenated string form of the tag's contents."""
        return self.decode_contents()

    def __repr__(self):
        """Detailed representation of the tag."""
        return f"<{self.name} {self.attrs}>"

    def __call__(self, *args, **kwargs):
        """
        Allows calling find_all directly on the tag.
        Mimics BeautifulSoup's behavior.
        """
        return self.find_all(*args, **kwargs)

    def __contains__(self, item):
        """
        Check if an item is in the tag's contents.

        Args:
            item: Item to search for

        Returns:
            bool: True if item is in contents, False otherwise
        """
        return item in self.contents

    def __getitem__(self, key):
        """
        Get an attribute value using dictionary-like access.

        Args:
            key (str): Attribute name

        Returns:
            Any: Attribute value

        Raises:
            KeyError: If the attribute is not present
        """
        return self.attrs[key]

    def __iter__(self):
        """
        Iterate through tag's contents.

        Returns:
            Iterator: Contents of the tag
        """
        return iter(self.contents)

    def __eq__(self, other):
        """
        Compare tags based on name, attributes and rendered contents.

        Args:
            other (Tag): Tag to compare

        Returns:
            bool: True if tags are equivalent
        """
        if not isinstance(other, Tag):
            return False
        return (
            self.name == other.name and
            self.attrs == other.attrs and
            str(self) == str(other)
        )

    def __hash__(self):
        """
        Generate a hash consistent with ``__eq__``.

        List-valued attributes (e.g. a ``class`` list) are converted to
        tuples first so that hashing does not raise ``TypeError``.

        Returns:
            int: Hash value
        """
        frozen_attrs = frozenset(
            (key, tuple(value) if isinstance(value, list) else value)
            for key, value in self.attrs.items()
        )
        return hash((self.name, frozen_attrs, str(self)))

    @staticmethod
    def _class_tokens(tag) -> List[str]:
        """Return the tag's classes as a list, splitting a raw "a b" string."""
        raw = tag.attrs.get('class')
        if raw is None:
            return []
        if isinstance(raw, str):
            return raw.split()
        return list(raw)

    def find(self, name=None, attrs=None, recursive=True, text=None, **kwargs) -> Optional['Tag']:
        """
        Find the first matching element.

        Args:
            name (str, optional): Tag name to search for
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str, optional): Text content to match
            **kwargs: Extra attribute filters (``class_`` maps to ``class``)

        Returns:
            Tag or None: First matching element
        """
        results = self.find_all(name, attrs, recursive, text, limit=1, **kwargs)
        return results[0] if results else None

    def find_all(self, name=None, attrs=None, recursive=True, text=None, limit=None, **kwargs) -> List['Tag']:
        """
        Find all matching elements, in document order.

        This tag itself is tested as well as its descendants. With
        ``recursive=False`` only this tag and its direct children are tested.

        Args:
            name (str or re.Pattern, optional): Tag name to search for
                (strings compare case-insensitively)
            attrs (dict, optional): Attributes to match
            recursive (bool, optional): Search recursively
            text (str or re.Pattern, optional): Text content to match
            limit (int, optional): Maximum number of results (falsy = no limit)
            **kwargs: Extra attribute filters merged into ``attrs``;
                ``class_`` is accepted as an alias for ``class``

        Returns:
            List[Tag]: List of matching elements
        """
        # Copy so the caller's dict is never mutated, then fold in kwargs.
        filters = dict(attrs) if attrs else {}
        for key, value in kwargs.items():
            filters['class' if key == 'class_' else key] = value

        results = []

        def _match(tag):
            # Check tag name with case-insensitive and regex support
            if name:
                if isinstance(name, str):
                    if tag.name.lower() != name.lower():
                        return False
                elif isinstance(name, re.Pattern):
                    if not name.search(tag.name):
                        return False

            for k, v in filters.items():
                if k == 'class':
                    # Compare against whole class tokens, never substrings.
                    tag_classes = self._class_tokens(tag)
                    if isinstance(v, str) and v not in tag_classes:
                        return False
                    elif isinstance(v, list) and not all(cls in tag_classes for cls in v):
                        return False
                elif k == 'id':
                    if tag.get('id') != v:
                        return False
                else:
                    # Regex or exact match for other attributes
                    tag_attr = tag.attrs.get(k)
                    if isinstance(v, re.Pattern):
                        # A missing attribute must not match via str(None).
                        if tag_attr is None or not v.search(str(tag_attr)):
                            return False
                    elif tag_attr != v:
                        return False

            # Check text content
            if text:
                tag_text = tag.get_text(strip=True)
                if isinstance(text, str) and text.lower() not in tag_text.lower():
                    return False
                elif isinstance(text, re.Pattern) and not text.search(tag_text):
                    return False

            return True

        def _search(element, descend):
            """Collect matches; return True once ``limit`` has been reached."""
            if _match(element):
                results.append(element)
                if limit and len(results) >= limit:
                    return True

            if descend:
                for child in element.contents:
                    # Propagate the stop signal so no further siblings are visited.
                    if isinstance(child, Tag) and _search(child, recursive):
                        return True
            return False

        _search(self, True)
        return results

    @staticmethod
    def _matches_simple(tag, selector_part: str) -> bool:
        """Test one whitespace-free selector part (tag, .class, #id, [attr]) against a tag."""
        if '[' in selector_part and ']' in selector_part:
            # Attribute selector
            attr_match = Tag._ATTR_SELECTOR_RE.match(selector_part)
            if not attr_match:
                return False
            tag_name, attr, value = attr_match.groups()
            if tag_name and tag.name != tag_name:
                return False
            if value:
                return tag.get(attr) == value.strip("'\"")
            return attr in tag.attrs

        # Compound selector such as "div.note#main": every token must hold.
        tokens = re.findall(r'[.#]?[^.#]+', selector_part)
        if not tokens:
            return False
        classes = Tag._class_tokens(tag)
        for token in tokens:
            if token[0] == '.':
                if token[1:] not in classes:
                    return False
            elif token[0] == '#':
                if tag.get('id') != token[1:]:
                    return False
            elif token != '*' and tag.name != token:
                return False
        return True

    def select(self, selector: str) -> List['Tag']:
        """
        Select descendant elements using a simplified CSS selector.

        Supported: tag names, ``*``, ``.class``, ``#id``, compound forms such
        as ``div.note``, ``[attr]`` / ``tag[attr=value]``, and the descendant
        combinator (whitespace). Other combinators and pseudo-classes are
        not supported.

        Args:
            selector (str): CSS selector string

        Returns:
            List[Tag]: Matching elements in document order, without duplicates
        """
        parts = selector.split()
        if not parts:
            return []

        *ancestors, target = parts
        results = []

        def _walk(element, matched):
            # ``matched`` counts how many leading ancestor parts have been
            # satisfied on the path from ``self`` down to ``element``.
            # Matching greedily is sufficient for descendant combinators.
            for child in element.contents:
                if not isinstance(child, Tag):
                    continue
                if matched == len(ancestors) and self._matches_simple(child, target):
                    results.append(child)
                next_matched = matched
                if matched < len(ancestors) and self._matches_simple(child, ancestors[matched]):
                    next_matched = matched + 1
                _walk(child, next_matched)

        _walk(self, 0)
        return results

    def select_one(self, selector: str) -> Optional['Tag']:
        """
        Select the first element matching the CSS selector.

        Args:
            selector (str): CSS selector string

        Returns:
            Tag or None: First matching element
        """
        results = self.select(selector)
        return results[0] if results else None

    def get_text(self, separator=' ', strip=False, types=None) -> str:
        """
        Extract text from the tag and its descendants.

        Args:
            separator (str, optional): Text separator
            strip (bool, optional): Strip whitespace
            types (list, optional): Exact content types to include at this
                level; nested tags are extracted without this filter

        Returns:
            str: Extracted text
        """
        texts = []
        for content in self.contents:
            # Support filtering by content type
            if types is None or type(content) in types:
                if isinstance(content, NavigableString):
                    texts.append(str(content))
                elif isinstance(content, Tag):
                    texts.append(content.get_text(separator, strip))

        text = separator.join(texts)
        text = re.sub(r'\n\n+', '\n', text)  # Collapse runs of blank lines
        return text.strip() if strip else text

    def find_text(self, pattern: Union[str, re.Pattern], **kwargs) -> Optional[str]:
        """
        Find the first text matching a pattern.

        Args:
            pattern (str or re.Pattern): Pattern to match
            **kwargs: Additional arguments for get_text()

        Returns:
            str or None: The substring itself for a str pattern, the matched
                text for a regex, or None when nothing matches
        """
        text = self.get_text(**kwargs)

        if isinstance(pattern, str):
            return pattern if pattern in text else None
        elif isinstance(pattern, re.Pattern):
            match = pattern.search(text)
            return match.group(0) if match else None

    def replace_text(self, old: Union[str, re.Pattern], new: str, **kwargs) -> str:
        """
        Return the tag's text with a pattern replaced (the tree is not modified).

        Args:
            old (str or re.Pattern): Pattern to replace
            new (str): Replacement text
            **kwargs: Additional arguments for get_text()

        Returns:
            str: Modified text
        """
        text = self.get_text(**kwargs)

        if isinstance(old, str):
            return text.replace(old, new)
        elif isinstance(old, re.Pattern):
            return old.sub(new, text)

    def get(self, key: str, default: Any = None) -> Any:
        """
        Get an attribute value.

        Args:
            key (str): Attribute name
            default (Any, optional): Default value if attribute not found

        Returns:
            Any: Attribute value or default
        """
        return self.attrs.get(key, default)

    def _index_in_parent(self) -> Optional[int]:
        """Return this tag's position in its parent's contents, by identity."""
        if self.parent is None:
            return None
        # Identity, not ==: equal-looking siblings must not be confused.
        for index, item in enumerate(self.parent.contents):
            if item is self:
                return index
        return None

    def decompose(self) -> None:
        """Remove the tag and its contents from the document."""
        index = self._index_in_parent()
        if index is not None:
            del self.parent.contents[index]
        self.parent = None

    def extract(self) -> 'Tag':
        """
        Remove the tag from the document and return it.

        Returns:
            Tag: Extracted tag
        """
        self.decompose()
        return self

    def clear(self) -> None:
        """Remove all contents of the tag."""
        self.contents.clear()

    def replace_with(self, new_tag: 'Tag') -> None:
        """
        Replace this tag with another tag.

        Does nothing when this tag is not attached to a parent.

        Args:
            new_tag (Tag): Tag to replace the current tag
        """
        index = self._index_in_parent()
        if index is None:
            return
        self.parent.contents[index] = new_tag
        new_tag.parent = self.parent
        self.parent = None

    def decode_contents(self, eventual_encoding='utf-8') -> str:
        """
        Render the tag's contents as a string.

        Args:
            eventual_encoding (str, optional): Accepted for BeautifulSoup
                compatibility; not used

        Returns:
            str: Concatenation of ``str()`` of each child
        """
        return ''.join(str(content) for content in self.contents)

    def prettify(self, formatter='minimal') -> str:
        """
        Return an indented representation of the tag.

        Args:
            formatter (str, optional): Accepted for BeautifulSoup
                compatibility; not used

        Returns:
            str: Prettified tag representation
        """
        def _prettify(tag, indent=0):
            result = ' ' * indent + f'<{tag.name}'
            for k, v in tag.attrs.items():
                result += f' {k}="{v}"'
            result += '>\n'

            for content in tag.contents:
                if isinstance(content, Tag):
                    result += _prettify(content, indent + 2)
                else:
                    result += ' ' * (indent + 2) + str(content) + '\n'

            result += ' ' * indent + f'</{tag.name}>\n'
            return result

        return _prettify(self)
webscout/scout/parsers/__init__.py DELETED
@@ -1,69 +0,0 @@
1
- """
2
- Scout Parsers - Unified Parsing Interfaces
3
- """
4
-
5
- from typing import Dict, Type, Any
6
-
7
- from .html_parser import HTMLParser
8
- from .lxml_parser import LXMLParser
9
- from .html5lib_parser import HTML5Parser
10
-
11
class ParserRegistry:
    """
    Name-to-class lookup table for the HTML parsing backends Scout ships with.

    All state lives on the class, so registrations are shared process-wide.
    """

    _PARSERS: Dict[str, Type[Any]] = {
        'html.parser': HTMLParser,
        'lxml': LXMLParser,
        'html5lib': HTML5Parser
    }

    @classmethod
    def get_parser(cls, parser_name: str = 'html.parser') -> Any:
        """
        Build a fresh parser instance for the given backend name.

        Args:
            parser_name (str): Name of the parser to retrieve

        Returns:
            A new instance of the registered parser class, constructed
            with no arguments

        Raises:
            ValueError: If no parser is registered under ``parser_name``
        """
        registry = cls._PARSERS
        if parser_name in registry:
            parser_class = registry[parser_name]
            return parser_class()

        raise ValueError(f"Parser '{parser_name}' not found. Available parsers: {list(cls._PARSERS.keys())}")

    @classmethod
    def register_parser(cls, name: str, parser_class: Type[Any]):
        """
        Add a parser class under ``name``, replacing any existing entry.

        Args:
            name (str): Name of the parser
            parser_class (Type): Parser class to register
        """
        cls._PARSERS[name] = parser_class

    @classmethod
    def list_parsers(cls) -> Dict[str, Type[Any]]:
        """
        Return a snapshot of the registry.

        Returns:
            Shallow copy of the name-to-class mapping; mutating it does not
            affect the registry
        """
        snapshot = dict(cls._PARSERS)
        return snapshot
62
-
63
- # Expose key classes and functions
64
- __all__ = [
65
- 'HTMLParser',
66
- 'LXMLParser',
67
- 'HTML5Parser',
68
- 'ParserRegistry'
69
- ]
webscout/scout/parsers/html5lib_parser.py DELETED
@@ -1,172 +0,0 @@
1
- """
2
- Scout HTML5 Parser - Advanced HTML5 Parsing with html5lib
3
- """
4
-
5
- import re
6
- from typing import List, Optional, Dict, Any, Union
7
-
8
- import html5lib
9
- from ..element import Tag, NavigableString
10
-
11
class HTML5Parser:
    """
    HTML5 parser built on the html5lib library.

    Converts the element tree html5lib produces into Scout ``Tag`` /
    ``NavigableString`` nodes. Parsing is best-effort: failures are recorded
    and an empty ``Tag('root')`` is returned instead of raising.
    """

    _COMMENT_RE = re.compile(r'<!--.*?-->', flags=re.DOTALL)
    # The lookahead after the name keeps longer names such as <metadata>
    # from being mistaken for the void element <meta>.
    _VOID_TAG_RE = re.compile(
        r'<(br|img|input|hr|meta)(?=[\s/>])([^>]*?)(?<!/)>',
        flags=re.IGNORECASE,
    )

    def __init__(self, namespaces: bool = False, debug: bool = False):
        """
        Initialize the HTML5 parser.

        Args:
            namespaces (bool): Whether to preserve namespace information
            debug (bool): Debug flag; stored but not otherwise used here
        """
        self._namespaces = namespaces
        self._debug = debug
        self._parsing_errors = []

    def parse(self, markup: str) -> 'Tag':
        """
        Parse HTML5 markup and return the root tag.

        Args:
            markup (str): HTML5 content to parse (bytes are decoded as UTF-8)

        Returns:
            Tag: Parsed document root, or an empty ``Tag('root')`` when
                parsing fails (the error text is kept for
                ``get_parsing_errors``)
        """
        try:
            # html5lib rejects encoding arguments for str input, so normalise
            # to str here and pass no encoding at all.
            if isinstance(markup, (bytes, bytearray)):
                markup = bytes(markup).decode('utf-8', errors='replace')

            # Preprocess markup to handle common issues
            markup = self._preprocess_markup(markup)

            tree = html5lib.parse(
                markup,
                namespaceHTMLElements=self._namespaces
            )

            # The default etree builder returns the <html> Element itself;
            # only tree-style results expose getroot().
            root = tree.getroot() if hasattr(tree, 'getroot') else tree

            # Convert parsed tree to Scout Tag
            return self._convert_element(root)

        except Exception as e:
            # Deliberate best-effort: record the failure and hand back an
            # empty root rather than propagating.
            self._parsing_errors.append(str(e))
            return Tag('root')

    def _preprocess_markup(self, markup: str) -> str:
        """
        Preprocess HTML markup to handle common parsing issues.

        Removes HTML comments and rewrites ``<br>``, ``<img>``, ``<input>``,
        ``<hr>`` and ``<meta>`` tags into self-closing form.

        Args:
            markup (str): Raw HTML markup

        Returns:
            str: Preprocessed HTML markup
        """
        # Remove HTML comments
        markup = self._COMMENT_RE.sub('', markup)

        # Self-close void elements that are not already self-closed
        markup = self._VOID_TAG_RE.sub(r'<\1\2 />', markup)

        return markup

    def _convert_element(self, element) -> 'Tag':
        """
        Convert an html5lib (ElementTree-style) element to a Scout Tag.

        Args:
            element: html5lib parsed element

        Returns:
            Tag: Converted Scout Tag with ``parent`` set on every child node
        """
        # Create Tag with name and attributes
        tag = Tag(element.tag, dict(element.attrib))

        # Add leading text content
        if element.text:
            lead_text = NavigableString(element.text)
            lead_text.parent = tag
            tag.contents.append(lead_text)

        # Recursively add child elements
        for child in element:
            # Comment / processing-instruction nodes carry a non-string tag;
            # skip the node itself but keep the text that follows it.
            if isinstance(child.tag, str):
                child_tag = self._convert_element(child)
                child_tag.parent = tag
                tag.contents.append(child_tag)

            # Add tail text
            if child.tail:
                tail_text = NavigableString(child.tail)
                tail_text.parent = tag
                tag.contents.append(tail_text)

        return tag

    def get_parsing_errors(self) -> List[str]:
        """
        Retrieve parsing errors encountered during processing.

        Returns:
            List[str]: Error messages accumulated across ``parse`` calls
        """
        return self._parsing_errors

    def find_all(self, markup: str, tag: Optional[Union[str, List[str]]] = None,
                 attrs: Optional[Dict[str, Any]] = None,
                 recursive: bool = True,
                 text: Optional[str] = None,
                 limit: Optional[int] = None) -> 'List[Tag]':
        """
        Parse ``markup`` and find all matching elements, in document order.

        The root element is tested too. With ``recursive=False`` only the
        root and its direct children are tested.

        Args:
            markup (str): HTML content to parse
            tag (str or List[str], optional): Tag name(s) to search for
            attrs (dict, optional): Attribute filters (exact match)
            recursive (bool): Whether to search recursively
            text (str, optional): Substring to look for in the element's
                own (direct) text nodes
            limit (int, optional): Maximum number of results (falsy = no limit)

        Returns:
            List[Tag]: List of matching tags
        """
        root = self.parse(markup)
        results = []

        def matches(element) -> bool:
            """Check if an element matches search criteria."""
            # Tag filter
            if tag and isinstance(tag, str) and element.name != tag:
                return False
            if tag and isinstance(tag, list) and element.name not in tag:
                return False

            # Attribute filter
            if attrs:
                for key, value in attrs.items():
                    if key not in element.attrs or element.attrs[key] != value:
                        return False

            # Text filter
            if text:
                element_text = ' '.join(
                    str(c) for c in element.contents if isinstance(c, NavigableString)
                )
                if text not in element_text:
                    return False

            return True

        def collect_matches(element, descend: bool) -> bool:
            """Collect matches; return True once ``limit`` has been reached."""
            if matches(element):
                results.append(element)
                if limit and len(results) >= limit:
                    return True

            if descend:
                for child in element.contents:
                    # Propagate the stop signal so later siblings are skipped.
                    if isinstance(child, Tag) and collect_matches(child, recursive):
                        return True
            return False

        collect_matches(root, True)
        return results