webscout 8.2.6__py3-none-any.whl → 8.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of webscout might be problematic.

Files changed (292)
  1. webscout/AIutel.py +97 -87
  2. webscout/version.py +1 -1
  3. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/METADATA +2 -15
  4. webscout-8.2.7.dist-info/RECORD +26 -0
  5. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/WHEEL +1 -1
  6. webscout-8.2.7.dist-info/entry_points.txt +3 -0
  7. webscout-8.2.7.dist-info/top_level.txt +1 -0
  8. webscout/Extra/GitToolkit/__init__.py +0 -10
  9. webscout/Extra/GitToolkit/gitapi/__init__.py +0 -12
  10. webscout/Extra/GitToolkit/gitapi/repository.py +0 -195
  11. webscout/Extra/GitToolkit/gitapi/user.py +0 -96
  12. webscout/Extra/GitToolkit/gitapi/utils.py +0 -62
  13. webscout/Extra/YTToolkit/YTdownloader.py +0 -957
  14. webscout/Extra/YTToolkit/__init__.py +0 -3
  15. webscout/Extra/YTToolkit/transcriber.py +0 -476
  16. webscout/Extra/YTToolkit/ytapi/__init__.py +0 -6
  17. webscout/Extra/YTToolkit/ytapi/channel.py +0 -307
  18. webscout/Extra/YTToolkit/ytapi/errors.py +0 -13
  19. webscout/Extra/YTToolkit/ytapi/extras.py +0 -45
  20. webscout/Extra/YTToolkit/ytapi/https.py +0 -88
  21. webscout/Extra/YTToolkit/ytapi/patterns.py +0 -61
  22. webscout/Extra/YTToolkit/ytapi/playlist.py +0 -59
  23. webscout/Extra/YTToolkit/ytapi/pool.py +0 -8
  24. webscout/Extra/YTToolkit/ytapi/query.py +0 -40
  25. webscout/Extra/YTToolkit/ytapi/stream.py +0 -63
  26. webscout/Extra/YTToolkit/ytapi/utils.py +0 -62
  27. webscout/Extra/YTToolkit/ytapi/video.py +0 -232
  28. webscout/Extra/__init__.py +0 -7
  29. webscout/Extra/autocoder/__init__.py +0 -9
  30. webscout/Extra/autocoder/autocoder.py +0 -910
  31. webscout/Extra/autocoder/autocoder_utiles.py +0 -332
  32. webscout/Extra/gguf.py +0 -684
  33. webscout/Extra/tempmail/__init__.py +0 -28
  34. webscout/Extra/tempmail/async_utils.py +0 -141
  35. webscout/Extra/tempmail/base.py +0 -161
  36. webscout/Extra/tempmail/cli.py +0 -187
  37. webscout/Extra/tempmail/emailnator.py +0 -84
  38. webscout/Extra/tempmail/mail_tm.py +0 -361
  39. webscout/Extra/tempmail/temp_mail_io.py +0 -292
  40. webscout/Extra/weather.py +0 -194
  41. webscout/Extra/weather_ascii.py +0 -76
  42. webscout/Litlogger/__init__.py +0 -67
  43. webscout/Litlogger/core/__init__.py +0 -6
  44. webscout/Litlogger/core/level.py +0 -23
  45. webscout/Litlogger/core/logger.py +0 -165
  46. webscout/Litlogger/handlers/__init__.py +0 -12
  47. webscout/Litlogger/handlers/console.py +0 -33
  48. webscout/Litlogger/handlers/file.py +0 -143
  49. webscout/Litlogger/handlers/network.py +0 -173
  50. webscout/Litlogger/styles/__init__.py +0 -7
  51. webscout/Litlogger/styles/colors.py +0 -249
  52. webscout/Litlogger/styles/formats.py +0 -458
  53. webscout/Litlogger/styles/text.py +0 -87
  54. webscout/Litlogger/utils/__init__.py +0 -6
  55. webscout/Litlogger/utils/detectors.py +0 -153
  56. webscout/Litlogger/utils/formatters.py +0 -200
  57. webscout/Provider/AI21.py +0 -177
  58. webscout/Provider/AISEARCH/DeepFind.py +0 -250
  59. webscout/Provider/AISEARCH/ISou.py +0 -256
  60. webscout/Provider/AISEARCH/Perplexity.py +0 -359
  61. webscout/Provider/AISEARCH/__init__.py +0 -10
  62. webscout/Provider/AISEARCH/felo_search.py +0 -228
  63. webscout/Provider/AISEARCH/genspark_search.py +0 -208
  64. webscout/Provider/AISEARCH/hika_search.py +0 -198
  65. webscout/Provider/AISEARCH/iask_search.py +0 -436
  66. webscout/Provider/AISEARCH/monica_search.py +0 -246
  67. webscout/Provider/AISEARCH/scira_search.py +0 -322
  68. webscout/Provider/AISEARCH/webpilotai_search.py +0 -281
  69. webscout/Provider/Aitopia.py +0 -316
  70. webscout/Provider/AllenAI.py +0 -447
  71. webscout/Provider/Andi.py +0 -228
  72. webscout/Provider/Blackboxai.py +0 -229
  73. webscout/Provider/ChatGPTClone.py +0 -237
  74. webscout/Provider/ChatGPTGratis.py +0 -194
  75. webscout/Provider/ChatSandbox.py +0 -342
  76. webscout/Provider/Cloudflare.py +0 -325
  77. webscout/Provider/Cohere.py +0 -208
  78. webscout/Provider/Deepinfra.py +0 -338
  79. webscout/Provider/ElectronHub.py +0 -773
  80. webscout/Provider/ExaAI.py +0 -261
  81. webscout/Provider/ExaChat.py +0 -358
  82. webscout/Provider/Free2GPT.py +0 -241
  83. webscout/Provider/GPTWeb.py +0 -249
  84. webscout/Provider/Gemini.py +0 -169
  85. webscout/Provider/GithubChat.py +0 -370
  86. webscout/Provider/GizAI.py +0 -285
  87. webscout/Provider/Glider.py +0 -222
  88. webscout/Provider/Groq.py +0 -801
  89. webscout/Provider/HF_space/__init__.py +0 -0
  90. webscout/Provider/HF_space/qwen_qwen2.py +0 -206
  91. webscout/Provider/HeckAI.py +0 -257
  92. webscout/Provider/HuggingFaceChat.py +0 -469
  93. webscout/Provider/Hunyuan.py +0 -283
  94. webscout/Provider/Jadve.py +0 -291
  95. webscout/Provider/Koboldai.py +0 -381
  96. webscout/Provider/LambdaChat.py +0 -411
  97. webscout/Provider/Llama3.py +0 -259
  98. webscout/Provider/MCPCore.py +0 -315
  99. webscout/Provider/Marcus.py +0 -206
  100. webscout/Provider/Nemotron.py +0 -218
  101. webscout/Provider/Netwrck.py +0 -270
  102. webscout/Provider/OLLAMA.py +0 -396
  103. webscout/Provider/OPENAI/__init__.py +0 -28
  104. webscout/Provider/OPENAI/ai4chat.py +0 -286
  105. webscout/Provider/OPENAI/base.py +0 -46
  106. webscout/Provider/OPENAI/c4ai.py +0 -367
  107. webscout/Provider/OPENAI/chatgpt.py +0 -549
  108. webscout/Provider/OPENAI/chatgptclone.py +0 -481
  109. webscout/Provider/OPENAI/deepinfra.py +0 -309
  110. webscout/Provider/OPENAI/e2b.py +0 -1350
  111. webscout/Provider/OPENAI/exaai.py +0 -404
  112. webscout/Provider/OPENAI/exachat.py +0 -437
  113. webscout/Provider/OPENAI/freeaichat.py +0 -352
  114. webscout/Provider/OPENAI/glider.py +0 -316
  115. webscout/Provider/OPENAI/groq.py +0 -354
  116. webscout/Provider/OPENAI/heckai.py +0 -341
  117. webscout/Provider/OPENAI/llmchatco.py +0 -327
  118. webscout/Provider/OPENAI/mcpcore.py +0 -376
  119. webscout/Provider/OPENAI/multichat.py +0 -368
  120. webscout/Provider/OPENAI/netwrck.py +0 -350
  121. webscout/Provider/OPENAI/opkfc.py +0 -488
  122. webscout/Provider/OPENAI/scirachat.py +0 -462
  123. webscout/Provider/OPENAI/sonus.py +0 -294
  124. webscout/Provider/OPENAI/standardinput.py +0 -425
  125. webscout/Provider/OPENAI/textpollinations.py +0 -329
  126. webscout/Provider/OPENAI/toolbaz.py +0 -406
  127. webscout/Provider/OPENAI/typegpt.py +0 -346
  128. webscout/Provider/OPENAI/uncovrAI.py +0 -455
  129. webscout/Provider/OPENAI/utils.py +0 -211
  130. webscout/Provider/OPENAI/venice.py +0 -413
  131. webscout/Provider/OPENAI/wisecat.py +0 -381
  132. webscout/Provider/OPENAI/writecream.py +0 -156
  133. webscout/Provider/OPENAI/x0gpt.py +0 -371
  134. webscout/Provider/OPENAI/yep.py +0 -327
  135. webscout/Provider/OpenGPT.py +0 -209
  136. webscout/Provider/Openai.py +0 -496
  137. webscout/Provider/PI.py +0 -429
  138. webscout/Provider/Perplexitylabs.py +0 -415
  139. webscout/Provider/QwenLM.py +0 -254
  140. webscout/Provider/Reka.py +0 -214
  141. webscout/Provider/StandardInput.py +0 -290
  142. webscout/Provider/TTI/AiForce/__init__.py +0 -22
  143. webscout/Provider/TTI/AiForce/async_aiforce.py +0 -224
  144. webscout/Provider/TTI/AiForce/sync_aiforce.py +0 -245
  145. webscout/Provider/TTI/FreeAIPlayground/__init__.py +0 -9
  146. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +0 -181
  147. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +0 -180
  148. webscout/Provider/TTI/ImgSys/__init__.py +0 -23
  149. webscout/Provider/TTI/ImgSys/async_imgsys.py +0 -202
  150. webscout/Provider/TTI/ImgSys/sync_imgsys.py +0 -195
  151. webscout/Provider/TTI/MagicStudio/__init__.py +0 -2
  152. webscout/Provider/TTI/MagicStudio/async_magicstudio.py +0 -111
  153. webscout/Provider/TTI/MagicStudio/sync_magicstudio.py +0 -109
  154. webscout/Provider/TTI/Nexra/__init__.py +0 -22
  155. webscout/Provider/TTI/Nexra/async_nexra.py +0 -286
  156. webscout/Provider/TTI/Nexra/sync_nexra.py +0 -258
  157. webscout/Provider/TTI/PollinationsAI/__init__.py +0 -23
  158. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +0 -311
  159. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +0 -265
  160. webscout/Provider/TTI/__init__.py +0 -12
  161. webscout/Provider/TTI/aiarta/__init__.py +0 -2
  162. webscout/Provider/TTI/aiarta/async_aiarta.py +0 -482
  163. webscout/Provider/TTI/aiarta/sync_aiarta.py +0 -440
  164. webscout/Provider/TTI/artbit/__init__.py +0 -22
  165. webscout/Provider/TTI/artbit/async_artbit.py +0 -155
  166. webscout/Provider/TTI/artbit/sync_artbit.py +0 -148
  167. webscout/Provider/TTI/fastflux/__init__.py +0 -22
  168. webscout/Provider/TTI/fastflux/async_fastflux.py +0 -261
  169. webscout/Provider/TTI/fastflux/sync_fastflux.py +0 -252
  170. webscout/Provider/TTI/huggingface/__init__.py +0 -22
  171. webscout/Provider/TTI/huggingface/async_huggingface.py +0 -199
  172. webscout/Provider/TTI/huggingface/sync_huggingface.py +0 -195
  173. webscout/Provider/TTI/piclumen/__init__.py +0 -23
  174. webscout/Provider/TTI/piclumen/async_piclumen.py +0 -268
  175. webscout/Provider/TTI/piclumen/sync_piclumen.py +0 -233
  176. webscout/Provider/TTI/pixelmuse/__init__.py +0 -4
  177. webscout/Provider/TTI/pixelmuse/async_pixelmuse.py +0 -249
  178. webscout/Provider/TTI/pixelmuse/sync_pixelmuse.py +0 -182
  179. webscout/Provider/TTI/talkai/__init__.py +0 -4
  180. webscout/Provider/TTI/talkai/async_talkai.py +0 -229
  181. webscout/Provider/TTI/talkai/sync_talkai.py +0 -207
  182. webscout/Provider/TTS/__init__.py +0 -8
  183. webscout/Provider/TTS/base.py +0 -159
  184. webscout/Provider/TTS/deepgram.py +0 -156
  185. webscout/Provider/TTS/elevenlabs.py +0 -111
  186. webscout/Provider/TTS/gesserit.py +0 -128
  187. webscout/Provider/TTS/murfai.py +0 -113
  188. webscout/Provider/TTS/parler.py +0 -111
  189. webscout/Provider/TTS/speechma.py +0 -180
  190. webscout/Provider/TTS/streamElements.py +0 -333
  191. webscout/Provider/TTS/utils.py +0 -280
  192. webscout/Provider/TeachAnything.py +0 -233
  193. webscout/Provider/TextPollinationsAI.py +0 -306
  194. webscout/Provider/TwoAI.py +0 -280
  195. webscout/Provider/TypliAI.py +0 -305
  196. webscout/Provider/Venice.py +0 -258
  197. webscout/Provider/VercelAI.py +0 -253
  198. webscout/Provider/WiseCat.py +0 -233
  199. webscout/Provider/WrDoChat.py +0 -370
  200. webscout/Provider/Writecream.py +0 -237
  201. webscout/Provider/WritingMate.py +0 -269
  202. webscout/Provider/Youchat.py +0 -330
  203. webscout/Provider/__init__.py +0 -178
  204. webscout/Provider/ai4chat.py +0 -203
  205. webscout/Provider/aimathgpt.py +0 -189
  206. webscout/Provider/akashgpt.py +0 -335
  207. webscout/Provider/asksteve.py +0 -212
  208. webscout/Provider/bagoodex.py +0 -145
  209. webscout/Provider/cerebras.py +0 -288
  210. webscout/Provider/chatglm.py +0 -215
  211. webscout/Provider/cleeai.py +0 -213
  212. webscout/Provider/copilot.py +0 -425
  213. webscout/Provider/elmo.py +0 -283
  214. webscout/Provider/freeaichat.py +0 -285
  215. webscout/Provider/geminiapi.py +0 -208
  216. webscout/Provider/geminiprorealtime.py +0 -160
  217. webscout/Provider/granite.py +0 -235
  218. webscout/Provider/hermes.py +0 -266
  219. webscout/Provider/julius.py +0 -223
  220. webscout/Provider/koala.py +0 -268
  221. webscout/Provider/learnfastai.py +0 -325
  222. webscout/Provider/llama3mitril.py +0 -215
  223. webscout/Provider/llmchat.py +0 -255
  224. webscout/Provider/llmchatco.py +0 -306
  225. webscout/Provider/meta.py +0 -798
  226. webscout/Provider/multichat.py +0 -364
  227. webscout/Provider/scira_chat.py +0 -297
  228. webscout/Provider/scnet.py +0 -243
  229. webscout/Provider/searchchat.py +0 -292
  230. webscout/Provider/sonus.py +0 -258
  231. webscout/Provider/talkai.py +0 -194
  232. webscout/Provider/toolbaz.py +0 -353
  233. webscout/Provider/turboseek.py +0 -266
  234. webscout/Provider/typefully.py +0 -330
  235. webscout/Provider/typegpt.py +0 -289
  236. webscout/Provider/uncovr.py +0 -368
  237. webscout/Provider/x0gpt.py +0 -299
  238. webscout/Provider/yep.py +0 -389
  239. webscout/litagent/__init__.py +0 -29
  240. webscout/litagent/agent.py +0 -455
  241. webscout/litagent/constants.py +0 -60
  242. webscout/litprinter/__init__.py +0 -59
  243. webscout/scout/__init__.py +0 -8
  244. webscout/scout/core/__init__.py +0 -7
  245. webscout/scout/core/crawler.py +0 -140
  246. webscout/scout/core/scout.py +0 -568
  247. webscout/scout/core/search_result.py +0 -96
  248. webscout/scout/core/text_analyzer.py +0 -63
  249. webscout/scout/core/text_utils.py +0 -277
  250. webscout/scout/core/web_analyzer.py +0 -52
  251. webscout/scout/core.py +0 -881
  252. webscout/scout/element.py +0 -460
  253. webscout/scout/parsers/__init__.py +0 -69
  254. webscout/scout/parsers/html5lib_parser.py +0 -172
  255. webscout/scout/parsers/html_parser.py +0 -236
  256. webscout/scout/parsers/lxml_parser.py +0 -178
  257. webscout/scout/utils.py +0 -37
  258. webscout/swiftcli/__init__.py +0 -95
  259. webscout/swiftcli/core/__init__.py +0 -7
  260. webscout/swiftcli/core/cli.py +0 -297
  261. webscout/swiftcli/core/context.py +0 -104
  262. webscout/swiftcli/core/group.py +0 -241
  263. webscout/swiftcli/decorators/__init__.py +0 -28
  264. webscout/swiftcli/decorators/command.py +0 -221
  265. webscout/swiftcli/decorators/options.py +0 -220
  266. webscout/swiftcli/decorators/output.py +0 -252
  267. webscout/swiftcli/exceptions.py +0 -21
  268. webscout/swiftcli/plugins/__init__.py +0 -9
  269. webscout/swiftcli/plugins/base.py +0 -135
  270. webscout/swiftcli/plugins/manager.py +0 -262
  271. webscout/swiftcli/utils/__init__.py +0 -59
  272. webscout/swiftcli/utils/formatting.py +0 -252
  273. webscout/swiftcli/utils/parsing.py +0 -267
  274. webscout/zeroart/__init__.py +0 -55
  275. webscout/zeroart/base.py +0 -60
  276. webscout/zeroart/effects.py +0 -99
  277. webscout/zeroart/fonts.py +0 -816
  278. webscout-8.2.6.dist-info/RECORD +0 -307
  279. webscout-8.2.6.dist-info/entry_points.txt +0 -3
  280. webscout-8.2.6.dist-info/top_level.txt +0 -2
  281. webstoken/__init__.py +0 -30
  282. webstoken/classifier.py +0 -189
  283. webstoken/keywords.py +0 -216
  284. webstoken/language.py +0 -128
  285. webstoken/ner.py +0 -164
  286. webstoken/normalizer.py +0 -35
  287. webstoken/processor.py +0 -77
  288. webstoken/sentiment.py +0 -206
  289. webstoken/stemmer.py +0 -73
  290. webstoken/tagger.py +0 -60
  291. webstoken/tokenizer.py +0 -158
  292. {webscout-8.2.6.dist-info → webscout-8.2.7.dist-info}/licenses/LICENSE.md +0 -0
webstoken/keywords.py DELETED
@@ -1,216 +0,0 @@
- """
- Keyword extraction module using statistical and graph-based approaches.
- """
-
- from typing import Dict, List, Set, Tuple
- from collections import Counter, defaultdict
- import math
- import re
-
- from .tokenizer import WordTokenizer
- from .normalizer import TextNormalizer
-
-
- class KeywordExtractor:
-     """Keyword extraction using TF-IDF and TextRank-inspired algorithms."""
-
-     def __init__(self):
-         self.word_tokenizer = WordTokenizer()
-         self.normalizer = TextNormalizer()
-
-         # Common words to filter out beyond basic stop words
-         self.filter_words: Set[str] = {
-             'would', 'could', 'should', 'said', 'also', 'may', 'might',
-             'must', 'need', 'shall', 'want', 'way', 'time', 'just',
-             'now', 'like', 'make', 'made', 'well', 'back', 'even',
-             'still', 'way', 'take', 'took', 'get', 'got', 'go', 'went'
-         }
-
-     def _split_into_sentences(self, text: str) -> List[str]:
-         """Split text into sentences using simple rules."""
-         text = re.sub(r'\s+', ' ', text)
-         sentences = re.split(r'[.!?]+', text)
-         return [s.strip() for s in sentences if s.strip()]
-
-     def _calculate_word_scores(self, text: str) -> Dict[str, float]:
-         """Calculate word importance scores using frequency and position."""
-         # Normalize and tokenize text
-         text = self.normalizer.normalize(text)
-         sentences = self._split_into_sentences(text)
-
-         word_scores: Dict[str, float] = defaultdict(float)
-         word_positions: Dict[str, List[int]] = defaultdict(list)
-
-         # Calculate word frequencies and positions
-         for i, sentence in enumerate(sentences):
-             words = self.word_tokenizer.tokenize(sentence)
-             for j, word in enumerate(words):
-                 word = word.lower()
-                 if (word.isalnum() and
-                     len(word) > 2 and
-                     word not in self.filter_words and
-                     word not in self.normalizer.stop_words):
-                     word_scores[word] += 1
-                     word_positions[word].append(i)
-
-         # Adjust scores based on position
-         num_sentences = len(sentences)
-         for word, positions in word_positions.items():
-             # Words appearing in first or last sentences get bonus
-             if 0 in positions:
-                 word_scores[word] *= 1.2
-             if num_sentences - 1 in positions:
-                 word_scores[word] *= 1.1
-
-             # Words appearing throughout text get bonus
-             coverage = len(set(positions)) / num_sentences
-             word_scores[word] *= (1 + coverage)
-
-         return word_scores
-
-     def _calculate_word_cooccurrence(self, text: str, window_size: int = 3) -> Dict[str, Dict[str, int]]:
-         """Calculate word co-occurrence matrix."""
-         # Normalize and tokenize text
-         text = self.normalizer.normalize(text)
-         words = self.word_tokenizer.tokenize(text)
-
-         # Filter words
-         filtered_words = [
-             word.lower() for word in words
-             if (word.isalnum() and
-                 len(word) > 2 and
-                 word.lower() not in self.filter_words and
-                 word.lower() not in self.normalizer.stop_words)
-         ]
-
-         # Build co-occurrence matrix
-         cooccurrence: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
-
-         for i, word in enumerate(filtered_words):
-             for j in range(max(0, i - window_size), min(len(filtered_words), i + window_size + 1)):
-                 if i != j:
-                     cooccurrence[word][filtered_words[j]] += 1
-                     cooccurrence[filtered_words[j]][word] += 1
-
-         return cooccurrence
-
-     def _textrank_scores(self, cooccurrence: Dict[str, Dict[str, int]], damping: float = 0.85,
-                          iterations: int = 30) -> Dict[str, float]:
-         """Calculate TextRank scores from co-occurrence matrix."""
-         scores = {word: 1.0 for word in cooccurrence}
-
-         for _ in range(iterations):
-             new_scores = {}
-             for word in scores:
-                 if not cooccurrence[word]:
-                     continue
-
-                 incoming_score = sum(
-                     scores[other] * cooccurrence[word][other] / sum(cooccurrence[other].values())
-                     for other in cooccurrence[word]
-                 )
-                 new_scores[word] = (1 - damping) + damping * incoming_score
-
-             # Check convergence
-             score_diff = sum(abs(new_scores[w] - scores[w]) for w in scores)
-             scores = new_scores
-             if score_diff < 0.0001:
-                 break
-
-         return scores
-
-     def extract_keywords(self, text: str, num_keywords: int = 10,
-                          use_textrank: bool = True) -> List[Tuple[str, float]]:
-         """
-         Extract keywords from text using combined frequency and graph-based approach.
-
-         Args:
-             text: Input text
-             num_keywords: Number of keywords to return
-             use_textrank: Whether to use TextRank algorithm
-
-         Returns:
-             List of (keyword, score) tuples, sorted by score
-         """
-         if not text:
-             return []
-
-         # Get frequency-based scores
-         freq_scores = self._calculate_word_scores(text)
-
-         if use_textrank:
-             # Get TextRank scores
-             cooccurrence = self._calculate_word_cooccurrence(text)
-             textrank_scores = self._textrank_scores(cooccurrence)
-
-             # Combine scores
-             combined_scores = {
-                 word: freq_scores[word] * textrank_scores.get(word, 0)
-                 for word in freq_scores
-             }
-         else:
-             combined_scores = freq_scores
-
-         # Sort and return top keywords
-         sorted_words = sorted(
-             combined_scores.items(),
-             key=lambda x: x[1],
-             reverse=True
-         )
-
-         return sorted_words[:num_keywords]
-
-     def extract_keyphrases(self, text: str, num_phrases: int = 5,
-                            min_words: int = 2, max_words: int = 4) -> List[Tuple[str, float]]:
-         """
-         Extract key phrases from text.
-
-         Args:
-             text: Input text
-             num_phrases: Number of phrases to return
-             min_words: Minimum words in phrase
-             max_words: Maximum words in phrase
-
-         Returns:
-             List of (phrase, score) tuples, sorted by score
-         """
-         # Normalize and split into sentences
-         text = self.normalizer.normalize(text)
-         sentences = self._split_into_sentences(text)
-
-         # Get word importance scores
-         word_scores = self._calculate_word_scores(text)
-
-         # Extract candidate phrases
-         phrases: Dict[str, float] = {}
-
-         for sentence in sentences:
-             words = self.word_tokenizer.tokenize(sentence)
-
-             # Generate phrases of different lengths
-             for i in range(len(words)):
-                 for length in range(min_words, min(max_words + 1, len(words) - i + 1)):
-                     phrase_words = words[i:i+length]
-
-                     # Filter phrases
-                     if all(
-                         word.isalnum() and
-                         len(word) > 2 and
-                         word.lower() not in self.filter_words and
-                         word.lower() not in self.normalizer.stop_words
-                         for word in phrase_words
-                     ):
-                         phrase = ' '.join(phrase_words)
-                         # Score is average of word scores
-                         score = sum(word_scores.get(word.lower(), 0) for word in phrase_words)
-                         score /= len(phrase_words)
-                         phrases[phrase] = score
-
-         # Sort and return top phrases
-         sorted_phrases = sorted(
-             phrases.items(),
-             key=lambda x: x[1],
-             reverse=True
-         )
-
-         return sorted_phrases[:num_phrases]
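
For orientation, here is a minimal usage sketch of the removed KeywordExtractor (not part of the diff; it assumes webscout <= 8.2.6 is installed and that the webstoken tokenizer and normalizer modules it imports are present):

    from webstoken.keywords import KeywordExtractor

    extractor = KeywordExtractor()
    text = "Keyword extraction combines simple frequency counts with a TextRank-style graph score."
    # (keyword, score) pairs, highest score first
    for word, score in extractor.extract_keywords(text, num_keywords=5):
        print(word, round(score, 3))
    # Multi-word phrases scored by the average of their word scores
    print(extractor.extract_keyphrases(text, num_phrases=3))
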
webstoken/language.py DELETED
@@ -1,128 +0,0 @@
- """
- Language detection module using character and word frequency analysis.
- """
-
- from typing import Dict, List, Set, Tuple
- from collections import Counter
- import re
-
-
- class LanguageDetector:
-     """Language detection using character n-gram frequencies."""
-
-     def __init__(self):
-         # Language profiles based on common character sequences
-         self.language_profiles = {
-             'ENGLISH': {
-                 'chars': 'etaoinshrdlcumwfgypbvkjxqz',
-                 'ngrams': {'th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd',
-                            'ti', 'es', 'or', 'te', 'of', 'ed', 'is', 'it', 'al', 'ar',
-                            'st', 'to', 'nt', 'ng', 'se', 'ha', 'as', 'ou', 'io', 'le'},
-                 'words': {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
-                           'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
-                           'do', 'at'}
-             },
-             'SPANISH': {
-                 'chars': 'eaosrnidlctumpbgvyqhfzjñxwk',
-                 'ngrams': {'de', 'en', 'el', 'la', 'os', 'es', 'as', 'ar', 'er', 'ra',
-                            'al', 'an', 'do', 'or', 'ta', 'ue', 'io', 'on', 'ro', 'ad',
-                            'te', 'co', 'st', 'ci', 'nt', 'to', 'lo', 'no', 'po', 'ac'},
-                 'words': {'de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'se', 'del',
-                           'las', 'un', 'por', 'con', 'no', 'una', 'su', 'para', 'es',
-                           'al'}
-             },
-             'FRENCH': {
-                 'chars': 'esaitnrulodcpmévqfbghàjxèêyçwzùâîôûëïüœ',
-                 'ngrams': {'es', 'le', 'en', 'de', 'nt', 'on', 're', 'er', 'ai', 'te',
-                            'la', 'an', 'ou', 'it', 'ur', 'et', 'el', 'se', 'qu', 'me',
-                            'is', 'ar', 'ce', 'ns', 'us', 'ue', 'ss', 'ie', 'em', 'tr'},
-                 'words': {'le', 'de', 'un', 'être', 'et', 'à', 'il', 'avoir', 'ne',
-                           'je', 'son', 'que', 'se', 'qui', 'ce', 'dans', 'en', 'du',
-                           'elle', 'au'}
-             },
-             'GERMAN': {
-                 'chars': 'enisratdhulcgmobwfkzvüpäößjyqxéèêëàáâãåāăąćčĉċďđ',
-                 'ngrams': {'en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge',
-                            'st', 'ne', 'be', 'es', 'un', 'zu', 'an', 'ng', 'au', 'it',
-                            'is', 'he', 'ht', 'se', 'ck', 'ic', 're', 'ns', 'sc', 'tz'},
-                 'words': {'der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit',
-                           'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht',
-                           'ein', 'eine', 'als'}
-             }
-         }
-
-         # Compile word patterns
-         self.word_pattern = re.compile(r'\b\w+\b')
-
-     def _extract_ngrams(self, text: str, n: int = 2) -> List[str]:
-         """Extract character n-grams from text."""
-         text = text.lower()
-         return [text[i:i+n] for i in range(len(text)-n+1)]
-
-     def _calculate_char_frequencies(self, text: str) -> Dict[str, float]:
-         """Calculate character frequencies in text."""
-         text = text.lower()
-         char_count = Counter(c for c in text if c.isalpha())
-         total = sum(char_count.values()) or 1
-         return {char: count/total for char, count in char_count.items()}
-
-     def _calculate_ngram_frequencies(self, text: str) -> Dict[str, float]:
-         """Calculate n-gram frequencies in text."""
-         ngrams = self._extract_ngrams(text)
-         ngram_count = Counter(ngrams)
-         total = sum(ngram_count.values()) or 1
-         return {ngram: count/total for ngram, count in ngram_count.items()}
-
-     def _calculate_word_frequencies(self, text: str) -> Dict[str, float]:
-         """Calculate word frequencies in text."""
-         words = self.word_pattern.findall(text.lower())
-         word_count = Counter(words)
-         total = sum(word_count.values()) or 1
-         return {word: count/total for word, count in word_count.items()}
-
-     def _calculate_similarity(self, freq1: Dict[str, float], freq2: Dict[str, float]) -> float:
-         """Calculate similarity between two frequency distributions."""
-         common_keys = set(freq1.keys()) & set(freq2.keys())
-         if not common_keys:
-             return 0.0
-
-         similarity = sum(min(freq1.get(k, 0), freq2.get(k, 0)) for k in common_keys)
-         return similarity
-
-     def detect(self, text: str) -> List[Tuple[str, float]]:
-         """
-         Detect the language of text with confidence scores.
-
-         Returns:
-             List of (language, confidence) tuples, sorted by confidence
-         """
-         if not text:
-             return []
-
-         # Calculate frequencies for input text
-         char_freqs = self._calculate_char_frequencies(text)
-         ngram_freqs = self._calculate_ngram_frequencies(text)
-         word_freqs = self._calculate_word_frequencies(text)
-
-         # Calculate similarity scores for each language
-         scores = []
-         for lang, profile in self.language_profiles.items():
-             # Character similarity
-             char_sim = sum(char_freqs.get(c, 0) for c in profile['chars'])
-
-             # N-gram similarity
-             ngram_sim = sum(ngram_freqs.get(ng, 0) for ng in profile['ngrams'])
-
-             # Word similarity
-             word_sim = sum(word_freqs.get(w, 0) for w in profile['words'])
-
-             # Combined score (weighted average)
-             total_score = (0.3 * char_sim + 0.4 * ngram_sim + 0.3 * word_sim)
-             scores.append((lang, total_score))
-
-         # Normalize scores
-         total = sum(score for _, score in scores) or 1
-         normalized_scores = [(lang, score/total) for lang, score in scores]
-
-         # Sort by confidence
-         return sorted(normalized_scores, key=lambda x: x[1], reverse=True)
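
For context, a hedged sketch of how the removed LanguageDetector could be called (assumes webscout <= 8.2.6 is installed; the bundled profiles only cover ENGLISH, SPANISH, FRENCH, and GERMAN):

    from webstoken.language import LanguageDetector

    detector = LanguageDetector()
    # Returns (language, confidence) pairs sorted by confidence
    for lang, confidence in detector.detect("Der schnelle braune Fuchs springt über den faulen Hund."):
        print(f"{lang}: {confidence:.2f}")
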
webstoken/ner.py DELETED
@@ -1,164 +0,0 @@
- """
- Named Entity Recognition (NER) module for identifying and classifying named entities.
- """
-
- from typing import List, Tuple, Dict, Set
- import re
-
-
- class NamedEntityRecognizer:
-     """Rule-based Named Entity Recognition."""
-
-     def __init__(self):
-         # Common entity patterns
-         self.PERSON_TITLES = {
-             'mr', 'mrs', 'ms', 'miss', 'dr', 'prof', 'sir', 'madam',
-             'lord', 'lady', 'president', 'ceo', 'director'
-         }
-
-         self.ORGANIZATION_SUFFIXES = {
-             'inc', 'corp', 'ltd', 'llc', 'company', 'corporation',
-             'associates', 'partners', 'foundation', 'institute'
-         }
-
-         self.LOCATION_INDICATORS = {
-             'street', 'road', 'avenue', 'boulevard', 'lane', 'drive',
-             'circle', 'square', 'park', 'bridge', 'river', 'lake',
-             'mountain', 'forest', 'city', 'town', 'village', 'country'
-         }
-
-         self.DATE_MONTHS = {
-             'january', 'february', 'march', 'april', 'may', 'june',
-             'july', 'august', 'september', 'october', 'november', 'december'
-         }
-
-         # Compile regex patterns
-         self.patterns = {
-             'EMAIL': re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b'),
-             'URL': re.compile(r'https?://(?:[\w-]|(?:%[\da-fA-F]{2}))+'),
-             'PHONE': re.compile(r'\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
-             'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'),
-             'TIME': re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AaPp][Mm])?\b'),
-             'MONEY': re.compile(r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:dollars|USD|EUR|GBP)'),
-             'PERCENTAGE': re.compile(r'\b\d+(?:\.\d+)?%\b')
-         }
-
-     def is_capitalized(self, word: str) -> bool:
-         """Check if a word is capitalized."""
-         return word and word[0].isupper()
-
-     def extract_entities(self, text: str) -> Dict[str, List[Tuple[str, str]]]:
-         """
-         Extract named entities from text.
-
-         Returns:
-             Dict mapping entity types to list of (text, label) tuples
-         """
-         entities = {
-             'PERSON': [],
-             'ORGANIZATION': [],
-             'LOCATION': [],
-             'DATE': [],
-             'TIME': [],
-             'MONEY': [],
-             'EMAIL': [],
-             'URL': [],
-             'PHONE': [],
-             'PERCENTAGE': []
-         }
-
-         # First find regex pattern matches
-         for label, pattern in self.patterns.items():
-             for match in pattern.finditer(text):
-                 entities[label].append((match.group(), label))
-
-         # Process text word by word for other entities
-         words = text.split()
-         i = 0
-         while i < len(words):
-             word = words[i]
-             next_word = words[i + 1] if i + 1 < len(words) else None
-
-             # Check for person names
-             if word.lower() in self.PERSON_TITLES and next_word and self.is_capitalized(next_word):
-                 name_parts = []
-                 j = i + 1
-                 while j < len(words) and self.is_capitalized(words[j]):
-                     name_parts.append(words[j])
-                     j += 1
-                 if name_parts:
-                     entities['PERSON'].append((' '.join(name_parts), 'PERSON'))
-                     i = j
-                     continue
-
-             # Check for organizations
-             if self.is_capitalized(word):
-                 org_parts = [word]
-                 j = i + 1
-                 while j < len(words) and (
-                     self.is_capitalized(words[j]) or
-                     words[j].lower() in self.ORGANIZATION_SUFFIXES
-                 ):
-                     org_parts.append(words[j])
-                     j += 1
-                 if len(org_parts) > 1 or (
-                     len(org_parts) == 1 and
-                     any(suff in word.lower() for suff in self.ORGANIZATION_SUFFIXES)
-                 ):
-                     entities['ORGANIZATION'].append((' '.join(org_parts), 'ORGANIZATION'))
-                     i = j
-                     continue
-
-             # Check for locations
-             if word.lower() in self.LOCATION_INDICATORS and i > 0:
-                 if self.is_capitalized(words[i - 1]):
-                     entities['LOCATION'].append((words[i - 1] + ' ' + word, 'LOCATION'))
-
-             i += 1
-
-         return entities
-
-     def tag_text(self, text: str) -> List[Tuple[str, str]]:
-         """
-         Tag each word in text with its entity type.
-
-         Returns:
-             List of (word, entity_type) tuples
-         """
-         entities = self.extract_entities(text)
-         tagged = []
-
-         # Create a map of word positions to entity labels
-         position_labels = {}
-         text_lower = text.lower()
-
-         for entity_type, entity_list in entities.items():
-             for entity_text, _ in entity_list:
-                 start = text_lower.find(entity_text.lower())
-                 if start != -1:
-                     end = start + len(entity_text)
-                     for pos in range(start, end):
-                         position_labels[pos] = entity_type
-
-         # Tag each character position
-         current_pos = 0
-         current_word = []
-         current_label = 'O' # Outside any entity
-
-         for char in text:
-             if char.isspace():
-                 if current_word:
-                     tagged.append((''.join(current_word), current_label))
-                     current_word = []
-                     current_label = 'O'
-             else:
-                 current_word.append(char)
-                 if current_pos in position_labels:
-                     current_label = position_labels[current_pos]
-             current_pos += 1
-
-         # Add last word if exists
-         if current_word:
-             tagged.append((''.join(current_word), current_label))
-
-         return tagged
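
For context, a minimal sketch of the removed rule-based recognizer (assumes webscout <= 8.2.6; the sample sentence is made up for illustration):

    from webstoken.ner import NamedEntityRecognizer

    ner = NamedEntityRecognizer()
    sample = "President Jane Smith visited Acme Corp on 12/01/2024 and announced a $5,000 grant."
    entities = ner.extract_entities(sample)   # dict: entity type -> [(text, label), ...]
    print(entities['ORGANIZATION'], entities['MONEY'])
    tagged = ner.tag_text(sample)             # [(word, entity_type or 'O'), ...]
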
webstoken/normalizer.py DELETED
@@ -1,35 +0,0 @@
- """
- Text normalization utilities.
- """
-
- import re
- from typing import List, Set
-
-
- class TextNormalizer:
-     """Text normalization utilities."""
-
-     def __init__(self):
-         self.stop_words: Set[str] = {
-             'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
-             'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
-             'to', 'was', 'were', 'will', 'with'
-         }
-
-     def remove_stop_words(self, tokens: List[str]) -> List[str]:
-         """Remove common stop words from token list."""
-         return [token for token in tokens if token.lower() not in self.stop_words]
-
-     def normalize(self, text: str) -> str:
-         """Apply various normalization steps to text."""
-         # Convert to lowercase
-         text = text.lower()
-
-         # Replace multiple spaces with single space
-         text = re.sub(r'\s+', ' ', text)
-
-         # Remove special characters except apostrophes within words
-         text = re.sub(r'[^a-z0-9\s\']', '', text)
-         text = re.sub(r'\s\'|\'\s', ' ', text)
-
-         return text.strip()
webstoken/processor.py DELETED
@@ -1,77 +0,0 @@
- """
- Main text processing utilities combining all NLP components.
- """
-
- from typing import Dict, Any, List, Tuple
-
- from .tokenizer import SentenceTokenizer, WordTokenizer
- from .tagger import POSTagger
- from .stemmer import Stemmer
- from .normalizer import TextNormalizer
-
-
- def process_text(text: str, normalize: bool = True, remove_stops: bool = True) -> Dict[str, Any]:
-     """
-     Process text using all available NLP tools.
-
-     Args:
-         text (str): Input text to process
-         normalize (bool): Whether to normalize text
-         remove_stops (bool): Whether to remove stop words
-
-     Returns:
-         Dict containing processed results with the following structure:
-         {
-             'sentences': [
-                 {
-                     'original': str, # Original sentence
-                     'tokens': List[str], # Word tokens
-                     'pos_tags': List[Tuple[str, str]], # (word, tag) pairs
-                     'stems': List[Tuple[str, str]] # (word, stem) pairs
-                 },
-                 ...
-             ],
-             'num_sentences': int, # Total number of sentences
-             'num_tokens': int # Total number of tokens
-         }
-     """
-     # Initialize tools
-     sentence_tokenizer = SentenceTokenizer()
-     word_tokenizer = WordTokenizer()
-     pos_tagger = POSTagger()
-     stemmer = Stemmer()
-     normalizer = TextNormalizer()
-
-     # Process text
-     if normalize:
-         text = normalizer.normalize(text)
-
-     # Get sentences
-     sentences = sentence_tokenizer.tokenize(text)
-
-     # Process each sentence
-     processed_sentences = []
-     for sentence in sentences:
-         # Tokenize words
-         tokens = word_tokenizer.tokenize(sentence)
-
-         # Remove stop words if requested
-         if remove_stops:
-             tokens = normalizer.remove_stop_words(tokens)
-
-         # Get POS tags and stems
-         tagged = pos_tagger.tag(tokens)
-         stems = [(token, stemmer.stem(token)) for token, _ in tagged]
-
-         processed_sentences.append({
-             'original': sentence,
-             'tokens': tokens,
-             'pos_tags': tagged,
-             'stems': stems
-         })
-
-     return {
-         'sentences': processed_sentences,
-         'num_sentences': len(sentences),
-         'num_tokens': sum(len(s['tokens']) for s in processed_sentences)
-     }
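
Finally, a hedged end-to-end sketch of the removed process_text pipeline (assumes webscout <= 8.2.6 and the sibling tokenizer, tagger, and stemmer modules deleted above):

    from webstoken.processor import process_text

    result = process_text("Webscout bundled a small NLP toolkit. This release drops it.")
    print(result['num_sentences'], result['num_tokens'])
    for sentence in result['sentences']:
        print(sentence['original'])
        print(sentence['pos_tags'])  # (word, tag) pairs
        print(sentence['stems'])     # (word, stem) pairs
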