webscout 8.2.2__py3-none-any.whl → 8.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (306) hide show
  1. webscout/AIauto.py +112 -22
  2. webscout/AIbase.py +144 -7
  3. webscout/AIutel.py +249 -131
  4. webscout/Bard.py +579 -206
  5. webscout/DWEBS.py +78 -35
  6. webscout/__init__.py +0 -1
  7. webscout/cli.py +256 -0
  8. webscout/conversation.py +307 -436
  9. webscout/exceptions.py +23 -0
  10. webscout/prompt_manager.py +56 -42
  11. webscout/version.py +1 -1
  12. webscout/webscout_search.py +65 -47
  13. webscout/webscout_search_async.py +81 -126
  14. webscout/yep_search.py +93 -43
  15. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info}/METADATA +172 -52
  16. webscout-8.2.7.dist-info/RECORD +26 -0
  17. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info}/WHEEL +1 -1
  18. webscout-8.2.7.dist-info/entry_points.txt +3 -0
  19. webscout-8.2.7.dist-info/top_level.txt +1 -0
  20. inferno/__init__.py +0 -6
  21. inferno/__main__.py +0 -9
  22. inferno/cli.py +0 -6
  23. webscout/Extra/GitToolkit/__init__.py +0 -10
  24. webscout/Extra/GitToolkit/gitapi/__init__.py +0 -12
  25. webscout/Extra/GitToolkit/gitapi/repository.py +0 -195
  26. webscout/Extra/GitToolkit/gitapi/user.py +0 -96
  27. webscout/Extra/GitToolkit/gitapi/utils.py +0 -62
  28. webscout/Extra/YTToolkit/YTdownloader.py +0 -957
  29. webscout/Extra/YTToolkit/__init__.py +0 -3
  30. webscout/Extra/YTToolkit/transcriber.py +0 -476
  31. webscout/Extra/YTToolkit/ytapi/__init__.py +0 -6
  32. webscout/Extra/YTToolkit/ytapi/channel.py +0 -307
  33. webscout/Extra/YTToolkit/ytapi/errors.py +0 -13
  34. webscout/Extra/YTToolkit/ytapi/extras.py +0 -45
  35. webscout/Extra/YTToolkit/ytapi/https.py +0 -88
  36. webscout/Extra/YTToolkit/ytapi/patterns.py +0 -61
  37. webscout/Extra/YTToolkit/ytapi/playlist.py +0 -59
  38. webscout/Extra/YTToolkit/ytapi/pool.py +0 -8
  39. webscout/Extra/YTToolkit/ytapi/query.py +0 -40
  40. webscout/Extra/YTToolkit/ytapi/stream.py +0 -63
  41. webscout/Extra/YTToolkit/ytapi/utils.py +0 -62
  42. webscout/Extra/YTToolkit/ytapi/video.py +0 -232
  43. webscout/Extra/__init__.py +0 -7
  44. webscout/Extra/autocoder/__init__.py +0 -9
  45. webscout/Extra/autocoder/autocoder.py +0 -849
  46. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  47. webscout/Extra/gguf.py +0 -682
  48. webscout/Extra/tempmail/__init__.py +0 -28
  49. webscout/Extra/tempmail/async_utils.py +0 -141
  50. webscout/Extra/tempmail/base.py +0 -161
  51. webscout/Extra/tempmail/cli.py +0 -187
  52. webscout/Extra/tempmail/emailnator.py +0 -84
  53. webscout/Extra/tempmail/mail_tm.py +0 -361
  54. webscout/Extra/tempmail/temp_mail_io.py +0 -292
  55. webscout/Extra/weather.py +0 -194
  56. webscout/Extra/weather_ascii.py +0 -76
  57. webscout/LLM.py +0 -442
  58. webscout/Litlogger/__init__.py +0 -67
  59. webscout/Litlogger/core/__init__.py +0 -6
  60. webscout/Litlogger/core/level.py +0 -23
  61. webscout/Litlogger/core/logger.py +0 -165
  62. webscout/Litlogger/handlers/__init__.py +0 -12
  63. webscout/Litlogger/handlers/console.py +0 -33
  64. webscout/Litlogger/handlers/file.py +0 -143
  65. webscout/Litlogger/handlers/network.py +0 -173
  66. webscout/Litlogger/styles/__init__.py +0 -7
  67. webscout/Litlogger/styles/colors.py +0 -249
  68. webscout/Litlogger/styles/formats.py +0 -458
  69. webscout/Litlogger/styles/text.py +0 -87
  70. webscout/Litlogger/utils/__init__.py +0 -6
  71. webscout/Litlogger/utils/detectors.py +0 -153
  72. webscout/Litlogger/utils/formatters.py +0 -200
  73. webscout/Local/__init__.py +0 -12
  74. webscout/Local/__main__.py +0 -9
  75. webscout/Local/api.py +0 -576
  76. webscout/Local/cli.py +0 -516
  77. webscout/Local/config.py +0 -75
  78. webscout/Local/llm.py +0 -287
  79. webscout/Local/model_manager.py +0 -253
  80. webscout/Local/server.py +0 -721
  81. webscout/Local/utils.py +0 -93
  82. webscout/Provider/AI21.py +0 -177
  83. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  84. webscout/Provider/AISEARCH/ISou.py +0 -256
  85. webscout/Provider/AISEARCH/Perplexity.py +0 -359
  86. webscout/Provider/AISEARCH/__init__.py +0 -10
  87. webscout/Provider/AISEARCH/felo_search.py +0 -228
  88. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  89. webscout/Provider/AISEARCH/hika_search.py +0 -194
  90. webscout/Provider/AISEARCH/iask_search.py +0 -436
  91. webscout/Provider/AISEARCH/monica_search.py +0 -246
  92. webscout/Provider/AISEARCH/scira_search.py +0 -324
  93. webscout/Provider/AISEARCH/webpilotai_search.py +0 -281
  94. webscout/Provider/Aitopia.py +0 -292
  95. webscout/Provider/AllenAI.py +0 -413
  96. webscout/Provider/Andi.py +0 -228
  97. webscout/Provider/Blackboxai.py +0 -229
  98. webscout/Provider/C4ai.py +0 -432
  99. webscout/Provider/ChatGPTClone.py +0 -226
  100. webscout/Provider/ChatGPTES.py +0 -237
  101. webscout/Provider/ChatGPTGratis.py +0 -194
  102. webscout/Provider/Chatify.py +0 -175
  103. webscout/Provider/Cloudflare.py +0 -273
  104. webscout/Provider/Cohere.py +0 -208
  105. webscout/Provider/DeepSeek.py +0 -196
  106. webscout/Provider/Deepinfra.py +0 -297
  107. webscout/Provider/ElectronHub.py +0 -709
  108. webscout/Provider/ExaAI.py +0 -261
  109. webscout/Provider/ExaChat.py +0 -342
  110. webscout/Provider/Free2GPT.py +0 -241
  111. webscout/Provider/GPTWeb.py +0 -193
  112. webscout/Provider/Gemini.py +0 -169
  113. webscout/Provider/GithubChat.py +0 -367
  114. webscout/Provider/Glider.py +0 -211
  115. webscout/Provider/Groq.py +0 -670
  116. webscout/Provider/HF_space/__init__.py +0 -0
  117. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  118. webscout/Provider/HeckAI.py +0 -233
  119. webscout/Provider/HuggingFaceChat.py +0 -462
  120. webscout/Provider/Hunyuan.py +0 -272
  121. webscout/Provider/Jadve.py +0 -266
  122. webscout/Provider/Koboldai.py +0 -381
  123. webscout/Provider/LambdaChat.py +0 -392
  124. webscout/Provider/Llama.py +0 -200
  125. webscout/Provider/Llama3.py +0 -204
  126. webscout/Provider/Marcus.py +0 -148
  127. webscout/Provider/Netwrck.py +0 -228
  128. webscout/Provider/OLLAMA.py +0 -396
  129. webscout/Provider/OPENAI/__init__.py +0 -25
  130. webscout/Provider/OPENAI/base.py +0 -46
  131. webscout/Provider/OPENAI/c4ai.py +0 -367
  132. webscout/Provider/OPENAI/chatgpt.py +0 -549
  133. webscout/Provider/OPENAI/chatgptclone.py +0 -460
  134. webscout/Provider/OPENAI/deepinfra.py +0 -272
  135. webscout/Provider/OPENAI/e2b.py +0 -1350
  136. webscout/Provider/OPENAI/exaai.py +0 -404
  137. webscout/Provider/OPENAI/exachat.py +0 -433
  138. webscout/Provider/OPENAI/freeaichat.py +0 -352
  139. webscout/Provider/OPENAI/glider.py +0 -316
  140. webscout/Provider/OPENAI/heckai.py +0 -337
  141. webscout/Provider/OPENAI/llmchatco.py +0 -327
  142. webscout/Provider/OPENAI/netwrck.py +0 -348
  143. webscout/Provider/OPENAI/opkfc.py +0 -488
  144. webscout/Provider/OPENAI/scirachat.py +0 -463
  145. webscout/Provider/OPENAI/sonus.py +0 -294
  146. webscout/Provider/OPENAI/standardinput.py +0 -425
  147. webscout/Provider/OPENAI/textpollinations.py +0 -285
  148. webscout/Provider/OPENAI/toolbaz.py +0 -405
  149. webscout/Provider/OPENAI/typegpt.py +0 -346
  150. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  151. webscout/Provider/OPENAI/utils.py +0 -211
  152. webscout/Provider/OPENAI/venice.py +0 -413
  153. webscout/Provider/OPENAI/wisecat.py +0 -381
  154. webscout/Provider/OPENAI/writecream.py +0 -156
  155. webscout/Provider/OPENAI/x0gpt.py +0 -371
  156. webscout/Provider/OPENAI/yep.py +0 -327
  157. webscout/Provider/OpenGPT.py +0 -199
  158. webscout/Provider/Openai.py +0 -496
  159. webscout/Provider/PI.py +0 -344
  160. webscout/Provider/Perplexitylabs.py +0 -415
  161. webscout/Provider/Phind.py +0 -535
  162. webscout/Provider/PizzaGPT.py +0 -198
  163. webscout/Provider/QwenLM.py +0 -254
  164. webscout/Provider/Reka.py +0 -214
  165. webscout/Provider/StandardInput.py +0 -278
  166. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  167. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  168. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  169. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  170. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  171. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  172. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  173. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  174. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  175. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  176. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  177. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  178. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  179. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  180. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  181. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  182. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  183. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  184. webscout/Provider/TTI/__init__.py +0 -12
  185. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  186. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  187. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  188. webscout/Provider/TTI/artbit/__init__.py +0 -22
  189. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  190. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  191. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  192. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  193. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  194. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  195. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  196. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  197. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  198. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  199. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  200. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  201. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  202. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  203. webscout/Provider/TTI/talkai/__init__.py +0 -4
  204. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  205. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  206. webscout/Provider/TTS/__init__.py +0 -7
  207. webscout/Provider/TTS/deepgram.py +0 -156
  208. webscout/Provider/TTS/elevenlabs.py +0 -111
  209. webscout/Provider/TTS/gesserit.py +0 -127
  210. webscout/Provider/TTS/murfai.py +0 -113
  211. webscout/Provider/TTS/parler.py +0 -111
  212. webscout/Provider/TTS/speechma.py +0 -180
  213. webscout/Provider/TTS/streamElements.py +0 -333
  214. webscout/Provider/TTS/utils.py +0 -280
  215. webscout/Provider/TeachAnything.py +0 -187
  216. webscout/Provider/TextPollinationsAI.py +0 -231
  217. webscout/Provider/TwoAI.py +0 -199
  218. webscout/Provider/Venice.py +0 -219
  219. webscout/Provider/VercelAI.py +0 -234
  220. webscout/Provider/WebSim.py +0 -228
  221. webscout/Provider/WiseCat.py +0 -196
  222. webscout/Provider/Writecream.py +0 -211
  223. webscout/Provider/WritingMate.py +0 -197
  224. webscout/Provider/Youchat.py +0 -330
  225. webscout/Provider/__init__.py +0 -198
  226. webscout/Provider/ai4chat.py +0 -202
  227. webscout/Provider/aimathgpt.py +0 -189
  228. webscout/Provider/akashgpt.py +0 -342
  229. webscout/Provider/askmyai.py +0 -158
  230. webscout/Provider/asksteve.py +0 -203
  231. webscout/Provider/bagoodex.py +0 -145
  232. webscout/Provider/cerebras.py +0 -242
  233. webscout/Provider/chatglm.py +0 -205
  234. webscout/Provider/cleeai.py +0 -213
  235. webscout/Provider/copilot.py +0 -428
  236. webscout/Provider/elmo.py +0 -234
  237. webscout/Provider/freeaichat.py +0 -271
  238. webscout/Provider/gaurish.py +0 -244
  239. webscout/Provider/geminiapi.py +0 -208
  240. webscout/Provider/geminiprorealtime.py +0 -160
  241. webscout/Provider/granite.py +0 -187
  242. webscout/Provider/hermes.py +0 -219
  243. webscout/Provider/julius.py +0 -223
  244. webscout/Provider/koala.py +0 -268
  245. webscout/Provider/labyrinth.py +0 -340
  246. webscout/Provider/learnfastai.py +0 -266
  247. webscout/Provider/lepton.py +0 -194
  248. webscout/Provider/llama3mitril.py +0 -180
  249. webscout/Provider/llamatutor.py +0 -192
  250. webscout/Provider/llmchat.py +0 -213
  251. webscout/Provider/llmchatco.py +0 -311
  252. webscout/Provider/meta.py +0 -794
  253. webscout/Provider/multichat.py +0 -325
  254. webscout/Provider/promptrefine.py +0 -193
  255. webscout/Provider/scira_chat.py +0 -277
  256. webscout/Provider/scnet.py +0 -187
  257. webscout/Provider/searchchat.py +0 -293
  258. webscout/Provider/sonus.py +0 -208
  259. webscout/Provider/talkai.py +0 -194
  260. webscout/Provider/toolbaz.py +0 -320
  261. webscout/Provider/turboseek.py +0 -219
  262. webscout/Provider/tutorai.py +0 -252
  263. webscout/Provider/typefully.py +0 -280
  264. webscout/Provider/typegpt.py +0 -232
  265. webscout/Provider/uncovr.py +0 -312
  266. webscout/Provider/x0gpt.py +0 -256
  267. webscout/Provider/yep.py +0 -376
  268. webscout/litagent/__init__.py +0 -29
  269. webscout/litagent/agent.py +0 -455
  270. webscout/litagent/constants.py +0 -60
  271. webscout/litprinter/__init__.py +0 -59
  272. webscout/scout/__init__.py +0 -8
  273. webscout/scout/core/__init__.py +0 -7
  274. webscout/scout/core/crawler.py +0 -140
  275. webscout/scout/core/scout.py +0 -568
  276. webscout/scout/core/search_result.py +0 -96
  277. webscout/scout/core/text_analyzer.py +0 -63
  278. webscout/scout/core/text_utils.py +0 -277
  279. webscout/scout/core/web_analyzer.py +0 -52
  280. webscout/scout/core.py +0 -881
  281. webscout/scout/element.py +0 -460
  282. webscout/scout/parsers/__init__.py +0 -69
  283. webscout/scout/parsers/html5lib_parser.py +0 -172
  284. webscout/scout/parsers/html_parser.py +0 -236
  285. webscout/scout/parsers/lxml_parser.py +0 -178
  286. webscout/scout/utils.py +0 -37
  287. webscout/swiftcli/__init__.py +0 -809
  288. webscout/zeroart/__init__.py +0 -55
  289. webscout/zeroart/base.py +0 -60
  290. webscout/zeroart/effects.py +0 -99
  291. webscout/zeroart/fonts.py +0 -816
  292. webscout-8.2.2.dist-info/RECORD +0 -309
  293. webscout-8.2.2.dist-info/entry_points.txt +0 -5
  294. webscout-8.2.2.dist-info/top_level.txt +0 -3
  295. webstoken/__init__.py +0 -30
  296. webstoken/classifier.py +0 -189
  297. webstoken/keywords.py +0 -216
  298. webstoken/language.py +0 -128
  299. webstoken/ner.py +0 -164
  300. webstoken/normalizer.py +0 -35
  301. webstoken/processor.py +0 -77
  302. webstoken/sentiment.py +0 -206
  303. webstoken/stemmer.py +0 -73
  304. webstoken/tagger.py +0 -60
  305. webstoken/tokenizer.py +0 -158
  306. {webscout-8.2.2.dist-info → webscout-8.2.7.dist-info/licenses}/LICENSE.md +0 -0
@@ -1,96 +0,0 @@
1
- """
2
- Scout Search Result Module
3
- """
4
-
5
- from typing import List, Union, Callable, Any, Dict, Iterator
6
- from ..element import Tag
7
- from .text_analyzer import ScoutTextAnalyzer
8
-
9
-
10
- class ScoutSearchResult:
11
- """
12
- Represents a search result with advanced querying capabilities.
13
- Enhanced with more intelligent filtering and processing.
14
- """
15
- def __init__(self, results: List[Tag]):
16
- """
17
- Initialize a search result collection.
18
-
19
- Args:
20
- results (List[Tag]): List of matching tags
21
- """
22
- self._results = results
23
-
24
- def __len__(self) -> int:
25
- return len(self._results)
26
-
27
- def __iter__(self) -> Iterator[Tag]:
28
- return iter(self._results)
29
-
30
- def __getitem__(self, index: Union[int, slice]) -> Union[Tag, List[Tag]]:
31
- return self._results[index]
32
-
33
- def texts(self, separator=' ', strip=True) -> List[str]:
34
- """
35
- Extract texts from all results.
36
-
37
- Args:
38
- separator (str, optional): Text separator
39
- strip (bool, optional): Strip whitespace
40
-
41
- Returns:
42
- List[str]: List of extracted texts
43
- """
44
- return [tag.get_text(separator, strip) for tag in self._results]
45
-
46
- def attrs(self, attr_name: str) -> List[Any]:
47
- """
48
- Extract a specific attribute from all results.
49
-
50
- Args:
51
- attr_name (str): Attribute name to extract
52
-
53
- Returns:
54
- List[Any]: List of attribute values
55
- """
56
- return [tag.get(attr_name) for tag in self._results]
57
-
58
- def filter(self, predicate: Callable[[Tag], bool]) -> 'ScoutSearchResult':
59
- """
60
- Filter results using a predicate function.
61
-
62
- Args:
63
- predicate (Callable[[Tag], bool]): Filtering function
64
-
65
- Returns:
66
- ScoutSearchResult: Filtered search results
67
- """
68
- return ScoutSearchResult([tag for tag in self._results if predicate(tag)])
69
-
70
- def map(self, transform: Callable[[Tag], Any]) -> List[Any]:
71
- """
72
- Transform results using a mapping function.
73
-
74
- Args:
75
- transform (Callable[[Tag], Any]): Transformation function
76
-
77
- Returns:
78
- List[Any]: Transformed results
79
- """
80
- return [transform(tag) for tag in self._results]
81
-
82
- def analyze_text(self) -> Dict[str, Any]:
83
- """
84
- Perform text analysis on search results.
85
-
86
- Returns:
87
- Dict[str, Any]: Text analysis results
88
- """
89
- texts = self.texts(strip=True)
90
- full_text = ' '.join(texts)
91
-
92
- return {
93
- 'total_results': len(self._results),
94
- 'word_count': ScoutTextAnalyzer.count_words(full_text),
95
- 'entities': ScoutTextAnalyzer.extract_entities(full_text)
96
- }
@@ -1,63 +0,0 @@
1
- """
2
- Scout Text Analyzer Module
3
- """
4
- import re
5
- from collections import Counter
6
- from typing import List, Dict, Set
7
-
8
- class ScoutTextAnalyzer:
9
- """
10
- Advanced text analysis and processing utility.
11
- """
12
- @staticmethod
13
- def tokenize(text: str, lowercase=True, remove_punctuation=True) -> List[str]:
14
- """
15
- Tokenize text into words.
16
-
17
- Args:
18
- text (str): Input text
19
- lowercase (bool, optional): Convert to lowercase
20
- remove_punctuation (bool, optional): Remove punctuation
21
-
22
- Returns:
23
- List[str]: List of tokens
24
- """
25
- if lowercase:
26
- text = text.lower()
27
-
28
- if remove_punctuation:
29
- text = re.sub(r'[^\w\s]', '', text)
30
-
31
- return text.split()
32
-
33
- @staticmethod
34
- def count_words(text: str) -> Dict[str, int]:
35
- """
36
- Count word frequencies.
37
-
38
- Args:
39
- text (str): Input text
40
-
41
- Returns:
42
- Dict[str, int]: Word frequency dictionary
43
- """
44
- return dict(Counter(ScoutTextAnalyzer.tokenize(text)))
45
-
46
- @staticmethod
47
- def extract_entities(text: str) -> Dict[str, Set[str]]:
48
- """
49
- Extract named entities from text.
50
-
51
- Args:
52
- text (str): Input text
53
-
54
- Returns:
55
- Dict[str, Set[str]]: Extracted entities
56
- """
57
- entities = {
58
- 'emails': set(re.findall(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', text)),
59
- 'urls': set(re.findall(r'https?://\S+', text)),
60
- 'phones': set(re.findall(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b', text)),
61
- 'dates': set(re.findall(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', text))
62
- }
63
- return entities
@@ -1,277 +0,0 @@
1
- from typing import List, Dict, Tuple, Set, Pattern
2
- import re
3
-
4
-
5
- class SentenceTokenizer:
6
- """Advanced sentence tokenizer with support for complex cases and proper formatting."""
7
-
8
- def __init__(self) -> None:
9
- # Common abbreviations by category
10
- self.TITLES: Set[str] = {
11
- 'mr', 'mrs', 'ms', 'dr', 'prof', 'rev', 'sr', 'jr', 'esq',
12
- 'hon', 'pres', 'gov', 'atty', 'supt', 'det', 'rev', 'col','maj', 'gen', 'capt', 'cmdr',
13
- 'lt', 'sgt', 'cpl', 'pvt'
14
- }
15
-
16
- self.ACADEMIC: Set[str] = {
17
- 'ph.d', 'phd', 'm.d', 'md', 'b.a', 'ba', 'm.a', 'ma', 'd.d.s', 'dds',
18
- 'm.b.a', 'mba', 'b.sc', 'bsc', 'm.sc', 'msc', 'llb', 'll.b', 'bl'
19
- }
20
-
21
- self.ORGANIZATIONS: Set[str] = {
22
- 'inc', 'ltd', 'co', 'corp', 'llc', 'llp', 'assn', 'bros', 'plc', 'cos',
23
- 'intl', 'dept', 'est', 'dist', 'mfg', 'div'
24
- }
25
-
26
- self.MONTHS: Set[str] = {
27
- 'jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'
28
- }
29
-
30
- self.UNITS: Set[str] = {
31
- 'oz', 'pt', 'qt', 'gal', 'ml', 'cc', 'km', 'cm', 'mm', 'ft', 'in',
32
- 'kg', 'lb', 'lbs', 'hz', 'khz', 'mhz', 'ghz', 'kb', 'mb', 'gb', 'tb'
33
- }
34
-
35
- self.TECHNOLOGY: Set[str] = {
36
- 'v', 'ver', 'app', 'sys', 'dir', 'exe', 'lib', 'api', 'sdk', 'url',
37
- 'cpu', 'gpu', 'ram', 'rom', 'hdd', 'ssd', 'lan', 'wan', 'sql', 'html'
38
- }
39
-
40
- self.MISC: Set[str] = {
41
- 'vs', 'etc', 'ie', 'eg', 'no', 'al', 'ca', 'cf', 'pp', 'est', 'st',
42
- 'approx', 'appt', 'apt', 'dept', 'depts', 'min', 'max', 'avg'
43
- }
44
-
45
- # Combine all abbreviations
46
- self.all_abbreviations: Set[str] = (
47
- self.TITLES | self.ACADEMIC | self.ORGANIZATIONS |
48
- self.MONTHS | self.UNITS | self.TECHNOLOGY | self.MISC
49
- )
50
-
51
- # Special patterns
52
- self.ELLIPSIS: str = r'\.{2,}|…'
53
- self.URL_PATTERN: str = (
54
- r'(?:https?:\/\/|www\.)[\w\-\.]+\.[a-zA-Z]{2,}(?:\/[^\s]*)?'
55
- )
56
- self.EMAIL_PATTERN: str = r'[\w\.-]+@[\w\.-]+\.\w+'
57
- self.NUMBER_PATTERN: str = (
58
- r'\d+(?:\.\d+)?(?:%|°|km|cm|mm|m|kg|g|lb|ft|in|mph|kmh|hz|mhz|ghz)?'
59
- )
60
-
61
- # Quote and bracket pairs
62
- self.QUOTE_PAIRS: Dict[str, str] = {
63
- '"': '"', "'": "'", '"': '"', "「": "」", "『": "』",
64
- "«": "»", "‹": "›", "'": "'", "‚": "'"
65
- }
66
-
67
- self.BRACKETS: Dict[str, str] = {
68
- '(': ')', '[': ']', '{': '}', '⟨': '⟩', '「': '」',
69
- '『': '』', '【': '】', '〖': '〗', '「': '」'
70
- }
71
-
72
- # Compile regex patterns
73
- self._compile_patterns()
74
-
75
- def _compile_patterns(self) -> None:
76
- """Compile regex patterns for better performance."""
77
- # Pattern for finding potential sentence boundaries
78
- self.SENTENCE_END: Pattern = re.compile(
79
- r'''
80
- # Group for sentence endings
81
- (?:
82
- # Standard endings with optional quotes/brackets
83
- (?<=[.!?])[\"\'\)\]\}»›」』\s]*
84
-
85
- # Ellipsis
86
- |(?:\.{2,}|…)
87
-
88
- # Asian-style endings
89
- |(?<=[。!?」』】\s])
90
- )
91
-
92
- # Must be followed by whitespace and capital letter or number
93
- (?=\s+(?:[A-Z0-9]|["'({[\[「『《‹〈][A-Z]))
94
- ''',
95
- re.VERBOSE
96
- )
97
-
98
- # Pattern for abbreviations
99
- abbrev_pattern = '|'.join(re.escape(abbr) for abbr in self.all_abbreviations)
100
- self.ABBREV_PATTERN: Pattern = re.compile(
101
- fr'\b(?:{abbrev_pattern})\.?',
102
- re.IGNORECASE
103
- )
104
-
105
- def _protect_special_cases(self, text: str) -> Tuple[str, Dict[str, str]]:
106
- """Protect URLs, emails, and other special cases from being split."""
107
- protected = text
108
- placeholders: Dict[str, str] = {}
109
- counter = 0
110
-
111
- # Protect URLs and emails
112
- for pattern in [self.URL_PATTERN, self.EMAIL_PATTERN]:
113
- for match in re.finditer(pattern, protected):
114
- placeholder = f'__PROTECTED_{counter}__'
115
- placeholders[placeholder] = match.group()
116
- protected = protected.replace(match.group(), placeholder)
117
- counter += 1
118
-
119
- # Protect quoted content
120
- stack = []
121
- protected_chars = list(protected)
122
- i = 0
123
- while i < len(protected_chars):
124
- char = protected_chars[i]
125
- if char in self.QUOTE_PAIRS:
126
- stack.append((char, i))
127
- elif stack and char == self.QUOTE_PAIRS[stack[-1][0]]:
128
- start_quote, start_idx = stack.pop()
129
- content = ''.join(protected_chars[start_idx:i + 1])
130
- placeholder = f'__PROTECTED_{counter}__'
131
- placeholders[placeholder] = content
132
- protected_chars[start_idx:i + 1] = list(placeholder)
133
- counter += 1
134
- i += 1
135
-
136
- return ''.join(protected_chars), placeholders
137
-
138
- def _restore_special_cases(self, text: str, placeholders: Dict[str, str]) -> str:
139
- """Restore protected content."""
140
- restored = text
141
- for placeholder, original in placeholders.items():
142
- restored = restored.replace(placeholder, original)
143
- return restored
144
-
145
- def _handle_abbreviations(self, text: str) -> str:
146
- """Handle abbreviations to prevent incorrect sentence splitting."""
147
- def replace_abbrev(match: re.Match) -> str:
148
- abbr = match.group().lower().rstrip('.')
149
- if abbr in self.all_abbreviations:
150
- return match.group().replace('.', '__DOT__')
151
- return match.group()
152
-
153
- return self.ABBREV_PATTERN.sub(replace_abbrev, text)
154
-
155
- def _normalize_whitespace(self, text: str) -> str:
156
- """Normalize whitespace while preserving paragraph breaks."""
157
- # Replace multiple newlines with special marker
158
- text = re.sub(r'\n\s*\n', ' __PARA__ ', text)
159
- # Normalize remaining whitespace
160
- text = re.sub(r'\s+', ' ', text)
161
- return text.strip()
162
-
163
- def _restore_formatting(self, sentences: List[str]) -> List[str]:
164
- """Restore original formatting and clean up sentences."""
165
- restored = []
166
- for sentence in sentences:
167
- # Restore dots in abbreviations
168
- sentence = sentence.replace('__DOT__', '.')
169
-
170
- # Restore paragraph breaks
171
- sentence = sentence.replace('__PARA__', '\n\n')
172
-
173
- # Clean up whitespace
174
- sentence = re.sub(r'\s+', ' ', sentence).strip()
175
-
176
- # Capitalize first letter if it's lowercase and not an abbreviation
177
- words = sentence.split()
178
- if words and words[0].lower() not in self.all_abbreviations:
179
- sentence = sentence[0].upper() + sentence[1:]
180
-
181
- if sentence:
182
- restored.append(sentence)
183
-
184
- return restored
185
-
186
- def tokenize(self, text: str) -> List[str]:
187
- """
188
- Split text into sentences while handling complex cases.
189
-
190
- Args:
191
- text (str): Input text to split into sentences.
192
-
193
- Returns:
194
- List[str]: List of properly formatted sentences.
195
- """
196
- if not text or not text.strip():
197
- return []
198
-
199
- # Step 1: Protect special cases
200
- protected_text, placeholders = self._protect_special_cases(text)
201
-
202
- # Step 2: Normalize whitespace
203
- protected_text = self._normalize_whitespace(protected_text)
204
-
205
- # Step 3: Handle abbreviations
206
- protected_text = self._handle_abbreviations(protected_text)
207
-
208
- # Step 4: Split into potential sentences
209
- potential_sentences = self.SENTENCE_END.split(protected_text)
210
-
211
- # Step 5: Process and restore formatting
212
- sentences = self._restore_formatting(potential_sentences)
213
-
214
- # Step 6: Restore special cases
215
- sentences = [self._restore_special_cases(s, placeholders) for s in sentences]
216
-
217
- # Step 7: Post-process sentences
218
- final_sentences = []
219
- current_sentence = []
220
-
221
- for sentence in sentences:
222
- # Skip empty sentences
223
- if not sentence.strip():
224
- continue
225
-
226
- # Check if sentence might be continuation of previous
227
- if current_sentence and sentence[0].islower():
228
- current_sentence.append(sentence)
229
- else:
230
- if current_sentence:
231
- final_sentences.append(' '.join(current_sentence))
232
- current_sentence = [sentence]
233
-
234
- # Add last sentence if exists
235
- if current_sentence:
236
- final_sentences.append(' '.join(current_sentence))
237
-
238
- return final_sentences
239
-
240
-
241
- def split_sentences(text: str) -> List[str]:
242
- """
243
- Convenience function to split text into sentences using SentenceTokenizer.
244
-
245
- Args:
246
- text (str): Input text to split into sentences.
247
-
248
- Returns:
249
- List[str]: List of properly formatted sentences.
250
- """
251
- tokenizer = SentenceTokenizer()
252
- return tokenizer.tokenize(text)
253
-
254
-
255
- if __name__ == "__main__":
256
- # Test text with various challenging cases
257
- test_text: str = """
258
- Dr. Smith (Ph.D., M.D.) visited Washington D.C. on Jan. 20, 2024! He met with Prof. Johnson at 3:30 p.m.
259
- They discussed A.I. and machine learning... "What about the U.S. market?" asked Dr. Smith.
260
- The meeting ended at 5 p.m. Later, they went to Mr. Wilson's house (located at 123 Main St.) for dinner.
261
-
262
- Visit our website at https://www.example.com or email us at test@example.com!
263
- The temperature was 72.5°F (22.5°C). The company's Q3 2023 revenue was $12.5M USD.
264
-
265
- 「これは日本語の文章です。」This is a mixed-language text! How cool is that?
266
-
267
- Some technical specs: CPU: 3.5GHz, RAM: 16GB, Storage: 2TB SSD.
268
- Common abbreviations: etc., i.e., e.g., vs., cf., approx. 100 units.
269
- """
270
-
271
- # Process and print each sentence
272
- sentences: List[str] = split_sentences(test_text)
273
- print("Detected sentences:")
274
- print("-" * 80)
275
- for i, sentence in enumerate(sentences, 1):
276
- print(f"{i}. {sentence}")
277
- print("-" * 80)
@@ -1,52 +0,0 @@
1
- """
2
- Scout Web Analyzer Module
3
- """
4
-
5
- from typing import Dict, Any
6
- from ..element import Tag
7
-
8
- class ScoutWebAnalyzer:
9
- """
10
- Advanced web content analysis utility.
11
- """
12
- @staticmethod
13
- def analyze_page_structure(scout_obj) -> Dict[str, Any]:
14
- """
15
- Analyze the structure of a web page.
16
-
17
- Args:
18
- scout_obj: Parsed Scout object
19
-
20
- Returns:
21
- Dict[str, Any]: Page structure analysis
22
- """
23
- analysis = {
24
- 'tag_distribution': {},
25
- 'class_distribution': {},
26
- 'id_distribution': {},
27
- 'depth_analysis': {}
28
- }
29
-
30
- # Tag distribution
31
- for tag in scout_obj.find_all():
32
- analysis['tag_distribution'][tag.name] = analysis['tag_distribution'].get(tag.name, 0) + 1
33
-
34
- # Class distribution
35
- for tag in scout_obj.find_all(attrs={'class': True}):
36
- for cls in tag.get('class', []):
37
- analysis['class_distribution'][cls] = analysis['class_distribution'].get(cls, 0) + 1
38
-
39
- # ID distribution
40
- for tag in scout_obj.find_all(attrs={'id': True}):
41
- analysis['id_distribution'][tag.get('id')] = analysis['id_distribution'].get(tag.get('id'), 0) + 1
42
-
43
- # Depth analysis
44
- def _analyze_depth(tag, current_depth=0):
45
- analysis['depth_analysis'][current_depth] = analysis['depth_analysis'].get(current_depth, 0) + 1
46
- for child in tag.contents:
47
- if isinstance(child, Tag):
48
- _analyze_depth(child, current_depth + 1)
49
-
50
- _analyze_depth(scout_obj._soup)
51
-
52
- return analysis