webscout 7.1-py3-none-any.whl → 7.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of webscout might be problematic.
- webscout/AIauto.py +191 -191
- webscout/AIbase.py +122 -122
- webscout/AIutel.py +440 -440
- webscout/Bard.py +343 -161
- webscout/DWEBS.py +489 -492
- webscout/Extra/YTToolkit/YTdownloader.py +995 -995
- webscout/Extra/YTToolkit/__init__.py +2 -2
- webscout/Extra/YTToolkit/transcriber.py +476 -479
- webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
- webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
- webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
- webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
- webscout/Extra/YTToolkit/ytapi/video.py +103 -103
- webscout/Extra/autocoder/__init__.py +9 -9
- webscout/Extra/autocoder/autocoder_utiles.py +199 -199
- webscout/Extra/autocoder/rawdog.py +5 -7
- webscout/Extra/autollama.py +230 -230
- webscout/Extra/gguf.py +3 -3
- webscout/Extra/weather.py +171 -171
- webscout/LLM.py +442 -442
- webscout/Litlogger/__init__.py +67 -681
- webscout/Litlogger/core/__init__.py +6 -0
- webscout/Litlogger/core/level.py +23 -0
- webscout/Litlogger/core/logger.py +166 -0
- webscout/Litlogger/handlers/__init__.py +12 -0
- webscout/Litlogger/handlers/console.py +33 -0
- webscout/Litlogger/handlers/file.py +143 -0
- webscout/Litlogger/handlers/network.py +173 -0
- webscout/Litlogger/styles/__init__.py +7 -0
- webscout/Litlogger/styles/colors.py +249 -0
- webscout/Litlogger/styles/formats.py +460 -0
- webscout/Litlogger/styles/text.py +87 -0
- webscout/Litlogger/utils/__init__.py +6 -0
- webscout/Litlogger/utils/detectors.py +154 -0
- webscout/Litlogger/utils/formatters.py +200 -0
- webscout/Provider/AISEARCH/DeepFind.py +250 -250
- webscout/Provider/AISEARCH/ISou.py +277 -0
- webscout/Provider/AISEARCH/__init__.py +2 -1
- webscout/Provider/Blackboxai.py +3 -3
- webscout/Provider/ChatGPTGratis.py +226 -0
- webscout/Provider/Cloudflare.py +3 -4
- webscout/Provider/DeepSeek.py +218 -0
- webscout/Provider/Deepinfra.py +40 -24
- webscout/Provider/Free2GPT.py +131 -124
- webscout/Provider/Gemini.py +100 -115
- webscout/Provider/Glider.py +3 -3
- webscout/Provider/Groq.py +5 -1
- webscout/Provider/Jadve.py +3 -3
- webscout/Provider/Marcus.py +191 -192
- webscout/Provider/Netwrck.py +3 -3
- webscout/Provider/PI.py +2 -2
- webscout/Provider/PizzaGPT.py +2 -3
- webscout/Provider/QwenLM.py +311 -0
- webscout/Provider/TTI/AiForce/__init__.py +22 -22
- webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
- webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
- webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -0
- webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +206 -0
- webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +192 -0
- webscout/Provider/TTI/Nexra/__init__.py +22 -22
- webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
- webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
- webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
- webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
- webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
- webscout/Provider/TTI/__init__.py +2 -1
- webscout/Provider/TTI/artbit/__init__.py +22 -22
- webscout/Provider/TTI/artbit/async_artbit.py +184 -184
- webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
- webscout/Provider/TTI/blackbox/__init__.py +4 -4
- webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
- webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
- webscout/Provider/TTI/deepinfra/__init__.py +4 -4
- webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
- webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
- webscout/Provider/TTI/huggingface/__init__.py +22 -22
- webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
- webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
- webscout/Provider/TTI/imgninza/__init__.py +4 -4
- webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
- webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
- webscout/Provider/TTI/talkai/__init__.py +4 -4
- webscout/Provider/TTI/talkai/async_talkai.py +229 -229
- webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
- webscout/Provider/TTS/deepgram.py +182 -182
- webscout/Provider/TTS/elevenlabs.py +136 -136
- webscout/Provider/TTS/gesserit.py +150 -150
- webscout/Provider/TTS/murfai.py +138 -138
- webscout/Provider/TTS/parler.py +133 -134
- webscout/Provider/TTS/streamElements.py +360 -360
- webscout/Provider/TTS/utils.py +280 -280
- webscout/Provider/TTS/voicepod.py +116 -116
- webscout/Provider/TextPollinationsAI.py +28 -8
- webscout/Provider/WiseCat.py +193 -0
- webscout/Provider/__init__.py +146 -134
- webscout/Provider/cerebras.py +242 -227
- webscout/Provider/chatglm.py +204 -204
- webscout/Provider/dgaf.py +2 -3
- webscout/Provider/freeaichat.py +221 -0
- webscout/Provider/gaurish.py +2 -3
- webscout/Provider/geminiapi.py +208 -208
- webscout/Provider/granite.py +223 -0
- webscout/Provider/hermes.py +218 -218
- webscout/Provider/llama3mitril.py +179 -179
- webscout/Provider/llamatutor.py +3 -3
- webscout/Provider/llmchat.py +2 -3
- webscout/Provider/meta.py +794 -794
- webscout/Provider/multichat.py +331 -331
- webscout/Provider/typegpt.py +359 -359
- webscout/Provider/yep.py +3 -3
- webscout/__init__.py +1 -0
- webscout/__main__.py +5 -5
- webscout/cli.py +319 -319
- webscout/conversation.py +241 -242
- webscout/exceptions.py +328 -328
- webscout/litagent/__init__.py +28 -28
- webscout/litagent/agent.py +2 -3
- webscout/litprinter/__init__.py +0 -58
- webscout/scout/__init__.py +8 -8
- webscout/scout/core.py +884 -884
- webscout/scout/element.py +459 -459
- webscout/scout/parsers/__init__.py +69 -69
- webscout/scout/parsers/html5lib_parser.py +172 -172
- webscout/scout/parsers/html_parser.py +236 -236
- webscout/scout/parsers/lxml_parser.py +178 -178
- webscout/scout/utils.py +38 -38
- webscout/swiftcli/__init__.py +811 -811
- webscout/update_checker.py +2 -12
- webscout/version.py +1 -1
- webscout/webscout_search.py +87 -6
- webscout/webscout_search_async.py +58 -1
- webscout/yep_search.py +297 -0
- webscout/zeroart/__init__.py +54 -54
- webscout/zeroart/base.py +60 -60
- webscout/zeroart/effects.py +99 -99
- webscout/zeroart/fonts.py +816 -816
- {webscout-7.1.dist-info → webscout-7.3.dist-info}/METADATA +62 -22
- webscout-7.3.dist-info/RECORD +223 -0
- {webscout-7.1.dist-info → webscout-7.3.dist-info}/WHEEL +1 -1
- webstoken/__init__.py +30 -30
- webstoken/classifier.py +189 -189
- webstoken/keywords.py +216 -216
- webstoken/language.py +128 -128
- webstoken/ner.py +164 -164
- webstoken/normalizer.py +35 -35
- webstoken/processor.py +77 -77
- webstoken/sentiment.py +206 -206
- webstoken/stemmer.py +73 -73
- webstoken/tagger.py +60 -60
- webstoken/tokenizer.py +158 -158
- webscout-7.1.dist-info/RECORD +0 -198
- {webscout-7.1.dist-info → webscout-7.3.dist-info}/LICENSE.md +0 -0
- {webscout-7.1.dist-info → webscout-7.3.dist-info}/entry_points.txt +0 -0
- {webscout-7.1.dist-info → webscout-7.3.dist-info}/top_level.txt +0 -0
webstoken/classifier.py
CHANGED
@@ -1,189 +1,189 @@ — the hunk removes and re-adds all 189 lines with identical content (most likely a whitespace or line-ending change), so the file is reproduced once below:

```python
"""
Text classification module using rule-based and statistical approaches.
"""

from typing import Dict, List, Set, Tuple
from collections import Counter
import math
import re

from .normalizer import TextNormalizer
from .tokenizer import WordTokenizer


class TextClassifier:
    """Simple text classifier using TF-IDF and cosine similarity."""

    def __init__(self):
        self.word_tokenizer = WordTokenizer()
        self.normalizer = TextNormalizer()
        self.documents: Dict[str, List[str]] = {}  # category -> list of documents
        self.vocabulary: Set[str] = set()
        self.idf_scores: Dict[str, float] = {}
        self.category_vectors: Dict[str, Dict[str, float]] = {}

    def train(self, documents: Dict[str, List[str]]) -> None:
        """
        Train the classifier on labeled documents.

        Args:
            documents: Dict mapping categories to lists of documents
        """
        self.documents = documents

        # Build vocabulary and document frequencies
        doc_frequencies: Dict[str, int] = Counter()
        total_docs = sum(len(docs) for docs in documents.values())

        for category, docs in documents.items():
            for doc in docs:
                # Normalize and tokenize
                doc = self.normalizer.normalize(doc)
                tokens = self.word_tokenizer.tokenize(doc)

                # Update vocabulary and document frequencies
                unique_tokens = set(tokens)
                self.vocabulary.update(unique_tokens)
                doc_frequencies.update(unique_tokens)

        # Calculate IDF scores
        self.idf_scores = {
            word: math.log(total_docs / (freq + 1))
            for word, freq in doc_frequencies.items()
        }

        # Calculate TF-IDF vectors for each category
        for category, docs in documents.items():
            category_vector: Dict[str, float] = {word: 0.0 for word in self.vocabulary}

            for doc in docs:
                # Get term frequencies
                doc = self.normalizer.normalize(doc)
                tokens = self.word_tokenizer.tokenize(doc)
                term_freqs = Counter(tokens)

                # Update category vector with TF-IDF scores
                for word, tf in term_freqs.items():
                    if word in self.idf_scores:
                        category_vector[word] += tf * self.idf_scores[word]

            # Average the scores
            for word in category_vector:
                category_vector[word] /= len(docs)

            self.category_vectors[category] = category_vector

    def _calculate_vector(self, text: str) -> Dict[str, float]:
        """Calculate TF-IDF vector for input text."""
        # Normalize and tokenize
        text = self.normalizer.normalize(text)
        tokens = self.word_tokenizer.tokenize(text)
        term_freqs = Counter(tokens)

        # Calculate TF-IDF scores
        vector = {word: 0.0 for word in self.vocabulary}
        for word, tf in term_freqs.items():
            if word in self.idf_scores:
                vector[word] = tf * self.idf_scores[word]

        return vector

    def _cosine_similarity(self, vec1: Dict[str, float], vec2: Dict[str, float]) -> float:
        """Calculate cosine similarity between two vectors."""
        dot_product = sum(vec1[word] * vec2[word] for word in vec1)
        norm1 = math.sqrt(sum(score * score for score in vec1.values()))
        norm2 = math.sqrt(sum(score * score for score in vec2.values()))

        if norm1 == 0 or norm2 == 0:
            return 0.0
        return dot_product / (norm1 * norm2)

    def classify(self, text: str) -> List[Tuple[str, float]]:
        """
        Classify text into categories with confidence scores.

        Returns:
            List of (category, confidence) tuples, sorted by confidence
        """
        if not self.category_vectors:
            raise ValueError("Classifier must be trained before classification")

        # Calculate vector for input text
        text_vector = self._calculate_vector(text)

        # Calculate similarity with each category
        similarities = [
            (category, self._cosine_similarity(text_vector, category_vec))
            for category, category_vec in self.category_vectors.items()
        ]

        # Sort by similarity score
        return sorted(similarities, key=lambda x: x[1], reverse=True)


class TopicClassifier:
    """Rule-based topic classifier using keyword matching."""

    def __init__(self):
        # Define topic keywords
        self.topic_keywords = {
            'TECHNOLOGY': {
                'computer', 'software', 'hardware', 'internet', 'programming',
                'digital', 'data', 'algorithm', 'code', 'web', 'app', 'mobile',
                'cyber', 'robot', 'ai', 'artificial intelligence', 'machine learning'
            },
            'SCIENCE': {
                'research', 'experiment', 'laboratory', 'scientific', 'physics',
                'chemistry', 'biology', 'mathematics', 'theory', 'hypothesis',
                'study', 'discovery', 'innovation', 'analysis', 'observation'
            },
            'BUSINESS': {
                'company', 'market', 'finance', 'investment', 'stock', 'trade',
                'economy', 'business', 'corporate', 'startup', 'entrepreneur',
                'profit', 'revenue', 'management', 'strategy', 'commercial'
            },
            'POLITICS': {
                'government', 'policy', 'election', 'political', 'democracy',
                'parliament', 'congress', 'law', 'legislation', 'party',
                'vote', 'campaign', 'president', 'minister', 'diplomatic'
            },
            'SPORTS': {
                'game', 'team', 'player', 'competition', 'tournament',
                'championship', 'score', 'match', 'athlete', 'sport',
                'win', 'lose', 'victory', 'defeat', 'coach', 'training'
            },
            'ENTERTAINMENT': {
                'movie', 'film', 'music', 'song', 'concert', 'actor',
                'actress', 'celebrity', 'show', 'performance', 'art',
                'entertainment', 'theater', 'dance', 'festival', 'media'
            }
        }

        # Compile regex patterns for each topic
        self.topic_patterns = {
            topic: re.compile(r'\b(' + '|'.join(re.escape(kw) for kw in keywords) + r')\b', re.IGNORECASE)
            for topic, keywords in self.topic_keywords.items()
        }

    def classify(self, text: str) -> List[Tuple[str, float]]:
        """
        Classify text into topics with confidence scores.

        Returns:
            List of (topic, confidence) tuples, sorted by confidence
        """
        # Count keyword matches for each topic
        topic_matches = {
            topic: len(pattern.findall(text))
            for topic, pattern in self.topic_patterns.items()
        }

        # Calculate confidence scores
        total_matches = sum(topic_matches.values()) or 1  # Avoid division by zero
        topic_scores = [
            (topic, count / total_matches)
            for topic, count in topic_matches.items()
        ]

        # Sort by score
        return sorted(topic_scores, key=lambda x: x[1], reverse=True)
```
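For orientation, here is a minimal usage sketch of the two classifiers above. It is not taken from the package itself: the import path assumes `TextClassifier` and `TopicClassifier` are importable from `webstoken.classifier` as defined in this file, and the training corpus is made up.

```python
from webstoken.classifier import TextClassifier, TopicClassifier

# Train the statistical (TF-IDF) classifier on a tiny, made-up corpus.
classifier = TextClassifier()
classifier.train({
    "TECHNOLOGY": [
        "New software update improves mobile app performance",
        "The algorithm processes data from web servers",
    ],
    "SPORTS": [
        "The team won the championship match",
        "The athlete trained hard before the tournament game",
    ],
})

# classify() returns (category, cosine similarity) pairs, best match first.
print(classifier.classify("The coach praised the player after the match")[0])
# likely ('SPORTS', <score>), though the exact score depends on how
# TextNormalizer and WordTokenizer split the text

# The rule-based classifier needs no training; each topic's score is its
# share of the total keyword matches found in the text.
print(TopicClassifier().classify("Parliament passed new legislation after the election")[0])
# ('POLITICS', 1.0) here, since no other topic's keywords appear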