webscout 7.1__py3-none-any.whl → 7.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (154)
  1. webscout/AIauto.py +191 -191
  2. webscout/AIbase.py +122 -122
  3. webscout/AIutel.py +440 -440
  4. webscout/Bard.py +343 -161
  5. webscout/DWEBS.py +489 -492
  6. webscout/Extra/YTToolkit/YTdownloader.py +995 -995
  7. webscout/Extra/YTToolkit/__init__.py +2 -2
  8. webscout/Extra/YTToolkit/transcriber.py +476 -479
  9. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  10. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  11. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  12. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  13. webscout/Extra/YTToolkit/ytapi/video.py +103 -103
  14. webscout/Extra/autocoder/__init__.py +9 -9
  15. webscout/Extra/autocoder/autocoder_utiles.py +199 -199
  16. webscout/Extra/autocoder/rawdog.py +5 -7
  17. webscout/Extra/autollama.py +230 -230
  18. webscout/Extra/gguf.py +3 -3
  19. webscout/Extra/weather.py +171 -171
  20. webscout/LLM.py +442 -442
  21. webscout/Litlogger/__init__.py +67 -681
  22. webscout/Litlogger/core/__init__.py +6 -0
  23. webscout/Litlogger/core/level.py +23 -0
  24. webscout/Litlogger/core/logger.py +166 -0
  25. webscout/Litlogger/handlers/__init__.py +12 -0
  26. webscout/Litlogger/handlers/console.py +33 -0
  27. webscout/Litlogger/handlers/file.py +143 -0
  28. webscout/Litlogger/handlers/network.py +173 -0
  29. webscout/Litlogger/styles/__init__.py +7 -0
  30. webscout/Litlogger/styles/colors.py +249 -0
  31. webscout/Litlogger/styles/formats.py +460 -0
  32. webscout/Litlogger/styles/text.py +87 -0
  33. webscout/Litlogger/utils/__init__.py +6 -0
  34. webscout/Litlogger/utils/detectors.py +154 -0
  35. webscout/Litlogger/utils/formatters.py +200 -0
  36. webscout/Provider/AISEARCH/DeepFind.py +250 -250
  37. webscout/Provider/AISEARCH/ISou.py +277 -0
  38. webscout/Provider/AISEARCH/__init__.py +2 -1
  39. webscout/Provider/Blackboxai.py +3 -3
  40. webscout/Provider/ChatGPTGratis.py +226 -0
  41. webscout/Provider/Cloudflare.py +3 -4
  42. webscout/Provider/DeepSeek.py +218 -0
  43. webscout/Provider/Deepinfra.py +40 -24
  44. webscout/Provider/Free2GPT.py +131 -124
  45. webscout/Provider/Gemini.py +100 -115
  46. webscout/Provider/Glider.py +3 -3
  47. webscout/Provider/Groq.py +5 -1
  48. webscout/Provider/Jadve.py +3 -3
  49. webscout/Provider/Marcus.py +191 -192
  50. webscout/Provider/Netwrck.py +3 -3
  51. webscout/Provider/PI.py +2 -2
  52. webscout/Provider/PizzaGPT.py +2 -3
  53. webscout/Provider/QwenLM.py +311 -0
  54. webscout/Provider/TTI/AiForce/__init__.py +22 -22
  55. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
  56. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
  57. webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -0
  58. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +206 -0
  59. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +192 -0
  60. webscout/Provider/TTI/Nexra/__init__.py +22 -22
  61. webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
  62. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
  63. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
  64. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
  65. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
  66. webscout/Provider/TTI/__init__.py +2 -1
  67. webscout/Provider/TTI/artbit/__init__.py +22 -22
  68. webscout/Provider/TTI/artbit/async_artbit.py +184 -184
  69. webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
  70. webscout/Provider/TTI/blackbox/__init__.py +4 -4
  71. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
  72. webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
  73. webscout/Provider/TTI/deepinfra/__init__.py +4 -4
  74. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
  75. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
  76. webscout/Provider/TTI/huggingface/__init__.py +22 -22
  77. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
  78. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
  79. webscout/Provider/TTI/imgninza/__init__.py +4 -4
  80. webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
  81. webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
  82. webscout/Provider/TTI/talkai/__init__.py +4 -4
  83. webscout/Provider/TTI/talkai/async_talkai.py +229 -229
  84. webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
  85. webscout/Provider/TTS/deepgram.py +182 -182
  86. webscout/Provider/TTS/elevenlabs.py +136 -136
  87. webscout/Provider/TTS/gesserit.py +150 -150
  88. webscout/Provider/TTS/murfai.py +138 -138
  89. webscout/Provider/TTS/parler.py +133 -134
  90. webscout/Provider/TTS/streamElements.py +360 -360
  91. webscout/Provider/TTS/utils.py +280 -280
  92. webscout/Provider/TTS/voicepod.py +116 -116
  93. webscout/Provider/TextPollinationsAI.py +28 -8
  94. webscout/Provider/WiseCat.py +193 -0
  95. webscout/Provider/__init__.py +146 -134
  96. webscout/Provider/cerebras.py +242 -227
  97. webscout/Provider/chatglm.py +204 -204
  98. webscout/Provider/dgaf.py +2 -3
  99. webscout/Provider/freeaichat.py +221 -0
  100. webscout/Provider/gaurish.py +2 -3
  101. webscout/Provider/geminiapi.py +208 -208
  102. webscout/Provider/granite.py +223 -0
  103. webscout/Provider/hermes.py +218 -218
  104. webscout/Provider/llama3mitril.py +179 -179
  105. webscout/Provider/llamatutor.py +3 -3
  106. webscout/Provider/llmchat.py +2 -3
  107. webscout/Provider/meta.py +794 -794
  108. webscout/Provider/multichat.py +331 -331
  109. webscout/Provider/typegpt.py +359 -359
  110. webscout/Provider/yep.py +3 -3
  111. webscout/__init__.py +1 -0
  112. webscout/__main__.py +5 -5
  113. webscout/cli.py +319 -319
  114. webscout/conversation.py +241 -242
  115. webscout/exceptions.py +328 -328
  116. webscout/litagent/__init__.py +28 -28
  117. webscout/litagent/agent.py +2 -3
  118. webscout/litprinter/__init__.py +0 -58
  119. webscout/scout/__init__.py +8 -8
  120. webscout/scout/core.py +884 -884
  121. webscout/scout/element.py +459 -459
  122. webscout/scout/parsers/__init__.py +69 -69
  123. webscout/scout/parsers/html5lib_parser.py +172 -172
  124. webscout/scout/parsers/html_parser.py +236 -236
  125. webscout/scout/parsers/lxml_parser.py +178 -178
  126. webscout/scout/utils.py +38 -38
  127. webscout/swiftcli/__init__.py +811 -811
  128. webscout/update_checker.py +2 -12
  129. webscout/version.py +1 -1
  130. webscout/webscout_search.py +87 -6
  131. webscout/webscout_search_async.py +58 -1
  132. webscout/yep_search.py +297 -0
  133. webscout/zeroart/__init__.py +54 -54
  134. webscout/zeroart/base.py +60 -60
  135. webscout/zeroart/effects.py +99 -99
  136. webscout/zeroart/fonts.py +816 -816
  137. {webscout-7.1.dist-info → webscout-7.3.dist-info}/METADATA +62 -22
  138. webscout-7.3.dist-info/RECORD +223 -0
  139. {webscout-7.1.dist-info → webscout-7.3.dist-info}/WHEEL +1 -1
  140. webstoken/__init__.py +30 -30
  141. webstoken/classifier.py +189 -189
  142. webstoken/keywords.py +216 -216
  143. webstoken/language.py +128 -128
  144. webstoken/ner.py +164 -164
  145. webstoken/normalizer.py +35 -35
  146. webstoken/processor.py +77 -77
  147. webstoken/sentiment.py +206 -206
  148. webstoken/stemmer.py +73 -73
  149. webstoken/tagger.py +60 -60
  150. webstoken/tokenizer.py +158 -158
  151. webscout-7.1.dist-info/RECORD +0 -198
  152. {webscout-7.1.dist-info → webscout-7.3.dist-info}/LICENSE.md +0 -0
  153. {webscout-7.1.dist-info → webscout-7.3.dist-info}/entry_points.txt +0 -0
  154. {webscout-7.1.dist-info → webscout-7.3.dist-info}/top_level.txt +0 -0
webstoken/keywords.py CHANGED
@@ -1,216 +1,216 @@
1
- """
2
- Keyword extraction module using statistical and graph-based approaches.
3
- """
4
-
5
- from typing import Dict, List, Set, Tuple
6
- from collections import Counter, defaultdict
7
- import math
8
- import re
9
-
10
- from .tokenizer import WordTokenizer
11
- from .normalizer import TextNormalizer
12
-
13
-
14
- class KeywordExtractor:
15
- """Keyword extraction using TF-IDF and TextRank-inspired algorithms."""
16
-
17
- def __init__(self):
18
- self.word_tokenizer = WordTokenizer()
19
- self.normalizer = TextNormalizer()
20
-
21
- # Common words to filter out beyond basic stop words
22
- self.filter_words: Set[str] = {
23
- 'would', 'could', 'should', 'said', 'also', 'may', 'might',
24
- 'must', 'need', 'shall', 'want', 'way', 'time', 'just',
25
- 'now', 'like', 'make', 'made', 'well', 'back', 'even',
26
- 'still', 'way', 'take', 'took', 'get', 'got', 'go', 'went'
27
- }
28
-
29
- def _split_into_sentences(self, text: str) -> List[str]:
30
- """Split text into sentences using simple rules."""
31
- text = re.sub(r'\s+', ' ', text)
32
- sentences = re.split(r'[.!?]+', text)
33
- return [s.strip() for s in sentences if s.strip()]
34
-
35
- def _calculate_word_scores(self, text: str) -> Dict[str, float]:
36
- """Calculate word importance scores using frequency and position."""
37
- # Normalize and tokenize text
38
- text = self.normalizer.normalize(text)
39
- sentences = self._split_into_sentences(text)
40
-
41
- word_scores: Dict[str, float] = defaultdict(float)
42
- word_positions: Dict[str, List[int]] = defaultdict(list)
43
-
44
- # Calculate word frequencies and positions
45
- for i, sentence in enumerate(sentences):
46
- words = self.word_tokenizer.tokenize(sentence)
47
- for j, word in enumerate(words):
48
- word = word.lower()
49
- if (word.isalnum() and
50
- len(word) > 2 and
51
- word not in self.filter_words and
52
- word not in self.normalizer.stop_words):
53
- word_scores[word] += 1
54
- word_positions[word].append(i)
55
-
56
- # Adjust scores based on position
57
- num_sentences = len(sentences)
58
- for word, positions in word_positions.items():
59
- # Words appearing in first or last sentences get bonus
60
- if 0 in positions:
61
- word_scores[word] *= 1.2
62
- if num_sentences - 1 in positions:
63
- word_scores[word] *= 1.1
64
-
65
- # Words appearing throughout text get bonus
66
- coverage = len(set(positions)) / num_sentences
67
- word_scores[word] *= (1 + coverage)
68
-
69
- return word_scores
70
-
71
- def _calculate_word_cooccurrence(self, text: str, window_size: int = 3) -> Dict[str, Dict[str, int]]:
72
- """Calculate word co-occurrence matrix."""
73
- # Normalize and tokenize text
74
- text = self.normalizer.normalize(text)
75
- words = self.word_tokenizer.tokenize(text)
76
-
77
- # Filter words
78
- filtered_words = [
79
- word.lower() for word in words
80
- if (word.isalnum() and
81
- len(word) > 2 and
82
- word.lower() not in self.filter_words and
83
- word.lower() not in self.normalizer.stop_words)
84
- ]
85
-
86
- # Build co-occurrence matrix
87
- cooccurrence: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))
88
-
89
- for i, word in enumerate(filtered_words):
90
- for j in range(max(0, i - window_size), min(len(filtered_words), i + window_size + 1)):
91
- if i != j:
92
- cooccurrence[word][filtered_words[j]] += 1
93
- cooccurrence[filtered_words[j]][word] += 1
94
-
95
- return cooccurrence
96
-
97
- def _textrank_scores(self, cooccurrence: Dict[str, Dict[str, int]], damping: float = 0.85,
98
- iterations: int = 30) -> Dict[str, float]:
99
- """Calculate TextRank scores from co-occurrence matrix."""
100
- scores = {word: 1.0 for word in cooccurrence}
101
-
102
- for _ in range(iterations):
103
- new_scores = {}
104
- for word in scores:
105
- if not cooccurrence[word]:
106
- continue
107
-
108
- incoming_score = sum(
109
- scores[other] * cooccurrence[word][other] / sum(cooccurrence[other].values())
110
- for other in cooccurrence[word]
111
- )
112
- new_scores[word] = (1 - damping) + damping * incoming_score
113
-
114
- # Check convergence
115
- score_diff = sum(abs(new_scores[w] - scores[w]) for w in scores)
116
- scores = new_scores
117
- if score_diff < 0.0001:
118
- break
119
-
120
- return scores
121
-
122
- def extract_keywords(self, text: str, num_keywords: int = 10,
123
- use_textrank: bool = True) -> List[Tuple[str, float]]:
124
- """
125
- Extract keywords from text using combined frequency and graph-based approach.
126
-
127
- Args:
128
- text: Input text
129
- num_keywords: Number of keywords to return
130
- use_textrank: Whether to use TextRank algorithm
131
-
132
- Returns:
133
- List of (keyword, score) tuples, sorted by score
134
- """
135
- if not text:
136
- return []
137
-
138
- # Get frequency-based scores
139
- freq_scores = self._calculate_word_scores(text)
140
-
141
- if use_textrank:
142
- # Get TextRank scores
143
- cooccurrence = self._calculate_word_cooccurrence(text)
144
- textrank_scores = self._textrank_scores(cooccurrence)
145
-
146
- # Combine scores
147
- combined_scores = {
148
- word: freq_scores[word] * textrank_scores.get(word, 0)
149
- for word in freq_scores
150
- }
151
- else:
152
- combined_scores = freq_scores
153
-
154
- # Sort and return top keywords
155
- sorted_words = sorted(
156
- combined_scores.items(),
157
- key=lambda x: x[1],
158
- reverse=True
159
- )
160
-
161
- return sorted_words[:num_keywords]
162
-
163
- def extract_keyphrases(self, text: str, num_phrases: int = 5,
164
- min_words: int = 2, max_words: int = 4) -> List[Tuple[str, float]]:
165
- """
166
- Extract key phrases from text.
167
-
168
- Args:
169
- text: Input text
170
- num_phrases: Number of phrases to return
171
- min_words: Minimum words in phrase
172
- max_words: Maximum words in phrase
173
-
174
- Returns:
175
- List of (phrase, score) tuples, sorted by score
176
- """
177
- # Normalize and split into sentences
178
- text = self.normalizer.normalize(text)
179
- sentences = self._split_into_sentences(text)
180
-
181
- # Get word importance scores
182
- word_scores = self._calculate_word_scores(text)
183
-
184
- # Extract candidate phrases
185
- phrases: Dict[str, float] = {}
186
-
187
- for sentence in sentences:
188
- words = self.word_tokenizer.tokenize(sentence)
189
-
190
- # Generate phrases of different lengths
191
- for i in range(len(words)):
192
- for length in range(min_words, min(max_words + 1, len(words) - i + 1)):
193
- phrase_words = words[i:i+length]
194
-
195
- # Filter phrases
196
- if all(
197
- word.isalnum() and
198
- len(word) > 2 and
199
- word.lower() not in self.filter_words and
200
- word.lower() not in self.normalizer.stop_words
201
- for word in phrase_words
202
- ):
203
- phrase = ' '.join(phrase_words)
204
- # Score is average of word scores
205
- score = sum(word_scores.get(word.lower(), 0) for word in phrase_words)
206
- score /= len(phrase_words)
207
- phrases[phrase] = score
208
-
209
- # Sort and return top phrases
210
- sorted_phrases = sorted(
211
- phrases.items(),
212
- key=lambda x: x[1],
213
- reverse=True
214
- )
215
-
216
- return sorted_phrases[:num_phrases]
1
+ """
2
+ Keyword extraction module using statistical and graph-based approaches.
3
+ """
4
+
5
+ from typing import Dict, List, Set, Tuple
6
+ from collections import Counter, defaultdict
7
+ import math
8
+ import re
9
+
10
+ from .tokenizer import WordTokenizer
11
+ from .normalizer import TextNormalizer
12
+
13
+
14
class KeywordExtractor:
    """Rank the keywords and key phrases of a text.

    Word scores combine occurrence counts with sentence-position bonuses.
    ``extract_keywords`` can additionally weight those scores with a
    TextRank-style ranking computed over a word co-occurrence graph.
    """

    def __init__(self):
        self.word_tokenizer = WordTokenizer()
        self.normalizer = TextNormalizer()

        # Common words to filter out beyond the normalizer's stop words.
        self.filter_words: Set[str] = {
            'would', 'could', 'should', 'said', 'also', 'may', 'might',
            'must', 'need', 'shall', 'want', 'way', 'time', 'just',
            'now', 'like', 'make', 'made', 'well', 'back', 'even',
            'still', 'take', 'took', 'get', 'got', 'go', 'went',
        }

    def _is_candidate(self, word: str) -> bool:
        """Return True if *word* may be used as a keyword.

        A candidate is alphanumeric, longer than two characters, and absent
        (case-insensitively) from both ``self.filter_words`` and
        ``self.normalizer.stop_words``.
        """
        if not word.isalnum() or len(word) <= 2:
            return False
        lowered = word.lower()
        return (lowered not in self.filter_words and
                lowered not in self.normalizer.stop_words)

    def _split_into_sentences(self, text: str) -> List[str]:
        """Split text into sentences using simple rules.

        Whitespace runs are collapsed to a single space, the text is split on
        runs of ``.``, ``!`` or ``?``, and empty fragments are dropped.
        """
        text = re.sub(r'\s+', ' ', text)
        sentences = re.split(r'[.!?]+', text)
        return [s.strip() for s in sentences if s.strip()]

    def _calculate_word_scores(self, text: str) -> Dict[str, float]:
        """Calculate word importance scores using frequency and position.

        Args:
            text: Input text; it is normalized before scoring.

        Returns:
            Mapping of lower-cased word to score. The score starts as the
            occurrence count and is multiplied by 1.2 if the word occurs in
            the first sentence, by 1.1 if it occurs in the last sentence, and
            by ``1 + coverage`` where coverage is the fraction of sentences
            containing the word.
        """
        text = self.normalizer.normalize(text)
        sentences = self._split_into_sentences(text)

        word_scores: Dict[str, float] = defaultdict(float)
        word_positions: Dict[str, List[int]] = defaultdict(list)

        # Count occurrences and remember which sentences each word appears in.
        for index, sentence in enumerate(sentences):
            for token in self.word_tokenizer.tokenize(sentence):
                word = token.lower()
                if self._is_candidate(word):
                    word_scores[word] += 1
                    word_positions[word].append(index)

        # word_positions is empty whenever there are no sentences, so the
        # division by num_sentences below can never see zero.
        num_sentences = len(sentences)
        last_index = num_sentences - 1
        for word, positions in word_positions.items():
            seen = set(positions)

            # Words appearing in the first or last sentence get a bonus.
            if 0 in seen:
                word_scores[word] *= 1.2
            if last_index in seen:
                word_scores[word] *= 1.1

            # Words appearing throughout the text get a bonus.
            coverage = len(seen) / num_sentences
            word_scores[word] *= (1 + coverage)

        return word_scores

    def _calculate_word_cooccurrence(self, text: str, window_size: int = 3) -> Dict[str, Dict[str, int]]:
        """Calculate the word co-occurrence matrix.

        Args:
            text: Input text; it is normalized and tokenized as a whole.
            window_size: Maximum distance, counted in candidate words, at
                which two words are treated as co-occurring.

        Returns:
            Nested mapping ``word -> neighbour -> count``. The matrix is
            symmetric. Each in-window pairing is visited from both of its
            ends and every visit increments both directions, so a count is
            twice the number of pairings; the ratios used by TextRank are
            unaffected by that constant factor.
        """
        text = self.normalizer.normalize(text)
        words = self.word_tokenizer.tokenize(text)

        filtered_words = [word.lower() for word in words if self._is_candidate(word)]
        total = len(filtered_words)

        cooccurrence: Dict[str, Dict[str, int]] = defaultdict(lambda: defaultdict(int))

        for i, word in enumerate(filtered_words):
            for j in range(max(0, i - window_size), min(total, i + window_size + 1)):
                if i != j:
                    neighbour = filtered_words[j]
                    cooccurrence[word][neighbour] += 1
                    cooccurrence[neighbour][word] += 1

        return cooccurrence

    def _textrank_scores(self, cooccurrence: Dict[str, Dict[str, int]], damping: float = 0.85,
                         iterations: int = 30) -> Dict[str, float]:
        """Calculate TextRank scores from a co-occurrence matrix.

        Args:
            cooccurrence: Nested mapping ``word -> neighbour -> weight``.
            damping: Damping factor of the ranking recurrence.
            iterations: Maximum number of update rounds.

        Returns:
            Mapping of every key of *cooccurrence* to its score. A word with
            no neighbours keeps the base score ``1 - damping`` instead of
            being dropped, which previously made the convergence check raise
            ``KeyError``. Neighbours that are not keys of the matrix, or whose
            total outgoing weight is zero, contribute nothing.
        """
        scores = {word: 1.0 for word in cooccurrence}

        # The outgoing weight of a node never changes, so compute it once
        # rather than once per edge per iteration.
        out_weight = {word: sum(neighbours.values())
                      for word, neighbours in cooccurrence.items()}
        base = 1 - damping

        for _ in range(iterations):
            new_scores = {}
            for word in scores:
                incoming_score = 0
                for other, weight in cooccurrence[word].items():
                    total = out_weight.get(other)
                    if not total:
                        continue
                    incoming_score += scores[other] * weight / total
                new_scores[word] = base + damping * incoming_score

            # Stop early once the scores have settled.
            score_diff = sum(abs(new_scores[w] - scores[w]) for w in scores)
            scores = new_scores
            if score_diff < 0.0001:
                break

        return scores

    def extract_keywords(self, text: str, num_keywords: int = 10,
                         use_textrank: bool = True) -> List[Tuple[str, float]]:
        """
        Extract keywords from text using combined frequency and graph-based approach.

        Args:
            text: Input text
            num_keywords: Number of keywords to return
            use_textrank: Whether to multiply the frequency score by the
                TextRank score; words absent from the co-occurrence graph
                then score 0

        Returns:
            List of (keyword, score) tuples, sorted by descending score
        """
        if not text:
            return []

        freq_scores = self._calculate_word_scores(text)

        if use_textrank:
            cooccurrence = self._calculate_word_cooccurrence(text)
            textrank_scores = self._textrank_scores(cooccurrence)
            combined_scores = {
                word: freq_scores[word] * textrank_scores.get(word, 0)
                for word in freq_scores
            }
        else:
            combined_scores = freq_scores

        sorted_words = sorted(
            combined_scores.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return sorted_words[:num_keywords]

    def extract_keyphrases(self, text: str, num_phrases: int = 5,
                           min_words: int = 2, max_words: int = 4) -> List[Tuple[str, float]]:
        """
        Extract key phrases from text.

        Args:
            text: Input text
            num_phrases: Number of phrases to return
            min_words: Minimum words in phrase; values below 1 are treated
                as 1, because an empty phrase has no average score
            max_words: Maximum words in phrase

        Returns:
            List of (phrase, score) tuples, sorted by descending score. A
            phrase is a run of consecutive candidate words within one
            sentence, scored by the average of its word scores.
        """
        if not text:
            return []

        # A zero-length phrase would pass the all() filter below and then
        # divide by zero when averaged.
        min_words = max(1, min_words)

        text = self.normalizer.normalize(text)
        sentences = self._split_into_sentences(text)

        # NOTE(review): _calculate_word_scores normalizes its input again;
        # this assumes normalize() is idempotent -- confirm.
        word_scores = self._calculate_word_scores(text)

        phrases: Dict[str, float] = {}

        for sentence in sentences:
            words = self.word_tokenizer.tokenize(sentence)
            count = len(words)

            # Generate phrases of every allowed length from every start.
            for start in range(count):
                longest = min(max_words + 1, count - start + 1)
                for length in range(min_words, longest):
                    phrase_words = words[start:start + length]

                    if all(self._is_candidate(word) for word in phrase_words):
                        score = sum(word_scores.get(word.lower(), 0) for word in phrase_words)
                        score /= len(phrase_words)
                        phrases[' '.join(phrase_words)] = score

        sorted_phrases = sorted(
            phrases.items(),
            key=lambda x: x[1],
            reverse=True
        )

        return sorted_phrases[:num_phrases]