webscout-7.1-py3-none-any.whl → webscout-7.3-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic. Click here for more details.

Files changed (154)
  1. webscout/AIauto.py +191 -191
  2. webscout/AIbase.py +122 -122
  3. webscout/AIutel.py +440 -440
  4. webscout/Bard.py +343 -161
  5. webscout/DWEBS.py +489 -492
  6. webscout/Extra/YTToolkit/YTdownloader.py +995 -995
  7. webscout/Extra/YTToolkit/__init__.py +2 -2
  8. webscout/Extra/YTToolkit/transcriber.py +476 -479
  9. webscout/Extra/YTToolkit/ytapi/channel.py +307 -307
  10. webscout/Extra/YTToolkit/ytapi/playlist.py +58 -58
  11. webscout/Extra/YTToolkit/ytapi/pool.py +7 -7
  12. webscout/Extra/YTToolkit/ytapi/utils.py +62 -62
  13. webscout/Extra/YTToolkit/ytapi/video.py +103 -103
  14. webscout/Extra/autocoder/__init__.py +9 -9
  15. webscout/Extra/autocoder/autocoder_utiles.py +199 -199
  16. webscout/Extra/autocoder/rawdog.py +5 -7
  17. webscout/Extra/autollama.py +230 -230
  18. webscout/Extra/gguf.py +3 -3
  19. webscout/Extra/weather.py +171 -171
  20. webscout/LLM.py +442 -442
  21. webscout/Litlogger/__init__.py +67 -681
  22. webscout/Litlogger/core/__init__.py +6 -0
  23. webscout/Litlogger/core/level.py +23 -0
  24. webscout/Litlogger/core/logger.py +166 -0
  25. webscout/Litlogger/handlers/__init__.py +12 -0
  26. webscout/Litlogger/handlers/console.py +33 -0
  27. webscout/Litlogger/handlers/file.py +143 -0
  28. webscout/Litlogger/handlers/network.py +173 -0
  29. webscout/Litlogger/styles/__init__.py +7 -0
  30. webscout/Litlogger/styles/colors.py +249 -0
  31. webscout/Litlogger/styles/formats.py +460 -0
  32. webscout/Litlogger/styles/text.py +87 -0
  33. webscout/Litlogger/utils/__init__.py +6 -0
  34. webscout/Litlogger/utils/detectors.py +154 -0
  35. webscout/Litlogger/utils/formatters.py +200 -0
  36. webscout/Provider/AISEARCH/DeepFind.py +250 -250
  37. webscout/Provider/AISEARCH/ISou.py +277 -0
  38. webscout/Provider/AISEARCH/__init__.py +2 -1
  39. webscout/Provider/Blackboxai.py +3 -3
  40. webscout/Provider/ChatGPTGratis.py +226 -0
  41. webscout/Provider/Cloudflare.py +3 -4
  42. webscout/Provider/DeepSeek.py +218 -0
  43. webscout/Provider/Deepinfra.py +40 -24
  44. webscout/Provider/Free2GPT.py +131 -124
  45. webscout/Provider/Gemini.py +100 -115
  46. webscout/Provider/Glider.py +3 -3
  47. webscout/Provider/Groq.py +5 -1
  48. webscout/Provider/Jadve.py +3 -3
  49. webscout/Provider/Marcus.py +191 -192
  50. webscout/Provider/Netwrck.py +3 -3
  51. webscout/Provider/PI.py +2 -2
  52. webscout/Provider/PizzaGPT.py +2 -3
  53. webscout/Provider/QwenLM.py +311 -0
  54. webscout/Provider/TTI/AiForce/__init__.py +22 -22
  55. webscout/Provider/TTI/AiForce/async_aiforce.py +257 -257
  56. webscout/Provider/TTI/AiForce/sync_aiforce.py +242 -242
  57. webscout/Provider/TTI/FreeAIPlayground/__init__.py +9 -0
  58. webscout/Provider/TTI/FreeAIPlayground/async_freeaiplayground.py +206 -0
  59. webscout/Provider/TTI/FreeAIPlayground/sync_freeaiplayground.py +192 -0
  60. webscout/Provider/TTI/Nexra/__init__.py +22 -22
  61. webscout/Provider/TTI/Nexra/async_nexra.py +286 -286
  62. webscout/Provider/TTI/Nexra/sync_nexra.py +258 -258
  63. webscout/Provider/TTI/PollinationsAI/__init__.py +23 -23
  64. webscout/Provider/TTI/PollinationsAI/async_pollinations.py +330 -330
  65. webscout/Provider/TTI/PollinationsAI/sync_pollinations.py +285 -285
  66. webscout/Provider/TTI/__init__.py +2 -1
  67. webscout/Provider/TTI/artbit/__init__.py +22 -22
  68. webscout/Provider/TTI/artbit/async_artbit.py +184 -184
  69. webscout/Provider/TTI/artbit/sync_artbit.py +176 -176
  70. webscout/Provider/TTI/blackbox/__init__.py +4 -4
  71. webscout/Provider/TTI/blackbox/async_blackbox.py +212 -212
  72. webscout/Provider/TTI/blackbox/sync_blackbox.py +199 -199
  73. webscout/Provider/TTI/deepinfra/__init__.py +4 -4
  74. webscout/Provider/TTI/deepinfra/async_deepinfra.py +227 -227
  75. webscout/Provider/TTI/deepinfra/sync_deepinfra.py +199 -199
  76. webscout/Provider/TTI/huggingface/__init__.py +22 -22
  77. webscout/Provider/TTI/huggingface/async_huggingface.py +199 -199
  78. webscout/Provider/TTI/huggingface/sync_huggingface.py +195 -195
  79. webscout/Provider/TTI/imgninza/__init__.py +4 -4
  80. webscout/Provider/TTI/imgninza/async_ninza.py +214 -214
  81. webscout/Provider/TTI/imgninza/sync_ninza.py +209 -209
  82. webscout/Provider/TTI/talkai/__init__.py +4 -4
  83. webscout/Provider/TTI/talkai/async_talkai.py +229 -229
  84. webscout/Provider/TTI/talkai/sync_talkai.py +207 -207
  85. webscout/Provider/TTS/deepgram.py +182 -182
  86. webscout/Provider/TTS/elevenlabs.py +136 -136
  87. webscout/Provider/TTS/gesserit.py +150 -150
  88. webscout/Provider/TTS/murfai.py +138 -138
  89. webscout/Provider/TTS/parler.py +133 -134
  90. webscout/Provider/TTS/streamElements.py +360 -360
  91. webscout/Provider/TTS/utils.py +280 -280
  92. webscout/Provider/TTS/voicepod.py +116 -116
  93. webscout/Provider/TextPollinationsAI.py +28 -8
  94. webscout/Provider/WiseCat.py +193 -0
  95. webscout/Provider/__init__.py +146 -134
  96. webscout/Provider/cerebras.py +242 -227
  97. webscout/Provider/chatglm.py +204 -204
  98. webscout/Provider/dgaf.py +2 -3
  99. webscout/Provider/freeaichat.py +221 -0
  100. webscout/Provider/gaurish.py +2 -3
  101. webscout/Provider/geminiapi.py +208 -208
  102. webscout/Provider/granite.py +223 -0
  103. webscout/Provider/hermes.py +218 -218
  104. webscout/Provider/llama3mitril.py +179 -179
  105. webscout/Provider/llamatutor.py +3 -3
  106. webscout/Provider/llmchat.py +2 -3
  107. webscout/Provider/meta.py +794 -794
  108. webscout/Provider/multichat.py +331 -331
  109. webscout/Provider/typegpt.py +359 -359
  110. webscout/Provider/yep.py +3 -3
  111. webscout/__init__.py +1 -0
  112. webscout/__main__.py +5 -5
  113. webscout/cli.py +319 -319
  114. webscout/conversation.py +241 -242
  115. webscout/exceptions.py +328 -328
  116. webscout/litagent/__init__.py +28 -28
  117. webscout/litagent/agent.py +2 -3
  118. webscout/litprinter/__init__.py +0 -58
  119. webscout/scout/__init__.py +8 -8
  120. webscout/scout/core.py +884 -884
  121. webscout/scout/element.py +459 -459
  122. webscout/scout/parsers/__init__.py +69 -69
  123. webscout/scout/parsers/html5lib_parser.py +172 -172
  124. webscout/scout/parsers/html_parser.py +236 -236
  125. webscout/scout/parsers/lxml_parser.py +178 -178
  126. webscout/scout/utils.py +38 -38
  127. webscout/swiftcli/__init__.py +811 -811
  128. webscout/update_checker.py +2 -12
  129. webscout/version.py +1 -1
  130. webscout/webscout_search.py +87 -6
  131. webscout/webscout_search_async.py +58 -1
  132. webscout/yep_search.py +297 -0
  133. webscout/zeroart/__init__.py +54 -54
  134. webscout/zeroart/base.py +60 -60
  135. webscout/zeroart/effects.py +99 -99
  136. webscout/zeroart/fonts.py +816 -816
  137. {webscout-7.1.dist-info → webscout-7.3.dist-info}/METADATA +62 -22
  138. webscout-7.3.dist-info/RECORD +223 -0
  139. {webscout-7.1.dist-info → webscout-7.3.dist-info}/WHEEL +1 -1
  140. webstoken/__init__.py +30 -30
  141. webstoken/classifier.py +189 -189
  142. webstoken/keywords.py +216 -216
  143. webstoken/language.py +128 -128
  144. webstoken/ner.py +164 -164
  145. webstoken/normalizer.py +35 -35
  146. webstoken/processor.py +77 -77
  147. webstoken/sentiment.py +206 -206
  148. webstoken/stemmer.py +73 -73
  149. webstoken/tagger.py +60 -60
  150. webstoken/tokenizer.py +158 -158
  151. webscout-7.1.dist-info/RECORD +0 -198
  152. {webscout-7.1.dist-info → webscout-7.3.dist-info}/LICENSE.md +0 -0
  153. {webscout-7.1.dist-info → webscout-7.3.dist-info}/entry_points.txt +0 -0
  154. {webscout-7.1.dist-info → webscout-7.3.dist-info}/top_level.txt +0 -0
webstoken/language.py CHANGED
@@ -1,128 +1,128 @@
1
- """
2
- Language detection module using character and word frequency analysis.
3
- """
4
-
5
- from typing import Dict, List, Set, Tuple
6
- from collections import Counter
7
- import re
8
-
9
-
10
- class LanguageDetector:
11
- """Language detection using character n-gram frequencies."""
12
-
13
- def __init__(self):
14
- # Language profiles based on common character sequences
15
- self.language_profiles = {
16
- 'ENGLISH': {
17
- 'chars': 'etaoinshrdlcumwfgypbvkjxqz',
18
- 'ngrams': {'th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd',
19
- 'ti', 'es', 'or', 'te', 'of', 'ed', 'is', 'it', 'al', 'ar',
20
- 'st', 'to', 'nt', 'ng', 'se', 'ha', 'as', 'ou', 'io', 'le'},
21
- 'words': {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
22
- 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
23
- 'do', 'at'}
24
- },
25
- 'SPANISH': {
26
- 'chars': 'eaosrnidlctumpbgvyqhfzjñxwk',
27
- 'ngrams': {'de', 'en', 'el', 'la', 'os', 'es', 'as', 'ar', 'er', 'ra',
28
- 'al', 'an', 'do', 'or', 'ta', 'ue', 'io', 'on', 'ro', 'ad',
29
- 'te', 'co', 'st', 'ci', 'nt', 'to', 'lo', 'no', 'po', 'ac'},
30
- 'words': {'de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'se', 'del',
31
- 'las', 'un', 'por', 'con', 'no', 'una', 'su', 'para', 'es',
32
- 'al'}
33
- },
34
- 'FRENCH': {
35
- 'chars': 'esaitnrulodcpmévqfbghàjxèêyçwzùâîôûëïüœ',
36
- 'ngrams': {'es', 'le', 'en', 'de', 'nt', 'on', 're', 'er', 'ai', 'te',
37
- 'la', 'an', 'ou', 'it', 'ur', 'et', 'el', 'se', 'qu', 'me',
38
- 'is', 'ar', 'ce', 'ns', 'us', 'ue', 'ss', 'ie', 'em', 'tr'},
39
- 'words': {'le', 'de', 'un', 'être', 'et', 'à', 'il', 'avoir', 'ne',
40
- 'je', 'son', 'que', 'se', 'qui', 'ce', 'dans', 'en', 'du',
41
- 'elle', 'au'}
42
- },
43
- 'GERMAN': {
44
- 'chars': 'enisratdhulcgmobwfkzvüpäößjyqxéèêëàáâãåāăąćčĉċďđ',
45
- 'ngrams': {'en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge',
46
- 'st', 'ne', 'be', 'es', 'un', 'zu', 'an', 'ng', 'au', 'it',
47
- 'is', 'he', 'ht', 'se', 'ck', 'ic', 're', 'ns', 'sc', 'tz'},
48
- 'words': {'der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit',
49
- 'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht',
50
- 'ein', 'eine', 'als'}
51
- }
52
- }
53
-
54
- # Compile word patterns
55
- self.word_pattern = re.compile(r'\b\w+\b')
56
-
57
- def _extract_ngrams(self, text: str, n: int = 2) -> List[str]:
58
- """Extract character n-grams from text."""
59
- text = text.lower()
60
- return [text[i:i+n] for i in range(len(text)-n+1)]
61
-
62
- def _calculate_char_frequencies(self, text: str) -> Dict[str, float]:
63
- """Calculate character frequencies in text."""
64
- text = text.lower()
65
- char_count = Counter(c for c in text if c.isalpha())
66
- total = sum(char_count.values()) or 1
67
- return {char: count/total for char, count in char_count.items()}
68
-
69
- def _calculate_ngram_frequencies(self, text: str) -> Dict[str, float]:
70
- """Calculate n-gram frequencies in text."""
71
- ngrams = self._extract_ngrams(text)
72
- ngram_count = Counter(ngrams)
73
- total = sum(ngram_count.values()) or 1
74
- return {ngram: count/total for ngram, count in ngram_count.items()}
75
-
76
- def _calculate_word_frequencies(self, text: str) -> Dict[str, float]:
77
- """Calculate word frequencies in text."""
78
- words = self.word_pattern.findall(text.lower())
79
- word_count = Counter(words)
80
- total = sum(word_count.values()) or 1
81
- return {word: count/total for word, count in word_count.items()}
82
-
83
- def _calculate_similarity(self, freq1: Dict[str, float], freq2: Dict[str, float]) -> float:
84
- """Calculate similarity between two frequency distributions."""
85
- common_keys = set(freq1.keys()) & set(freq2.keys())
86
- if not common_keys:
87
- return 0.0
88
-
89
- similarity = sum(min(freq1.get(k, 0), freq2.get(k, 0)) for k in common_keys)
90
- return similarity
91
-
92
- def detect(self, text: str) -> List[Tuple[str, float]]:
93
- """
94
- Detect the language of text with confidence scores.
95
-
96
- Returns:
97
- List of (language, confidence) tuples, sorted by confidence
98
- """
99
- if not text:
100
- return []
101
-
102
- # Calculate frequencies for input text
103
- char_freqs = self._calculate_char_frequencies(text)
104
- ngram_freqs = self._calculate_ngram_frequencies(text)
105
- word_freqs = self._calculate_word_frequencies(text)
106
-
107
- # Calculate similarity scores for each language
108
- scores = []
109
- for lang, profile in self.language_profiles.items():
110
- # Character similarity
111
- char_sim = sum(char_freqs.get(c, 0) for c in profile['chars'])
112
-
113
- # N-gram similarity
114
- ngram_sim = sum(ngram_freqs.get(ng, 0) for ng in profile['ngrams'])
115
-
116
- # Word similarity
117
- word_sim = sum(word_freqs.get(w, 0) for w in profile['words'])
118
-
119
- # Combined score (weighted average)
120
- total_score = (0.3 * char_sim + 0.4 * ngram_sim + 0.3 * word_sim)
121
- scores.append((lang, total_score))
122
-
123
- # Normalize scores
124
- total = sum(score for _, score in scores) or 1
125
- normalized_scores = [(lang, score/total) for lang, score in scores]
126
-
127
- # Sort by confidence
128
- return sorted(normalized_scores, key=lambda x: x[1], reverse=True)
1
+ """
2
+ Language detection module using character and word frequency analysis.
3
+ """
4
+
5
+ from typing import Dict, List, Set, Tuple
6
+ from collections import Counter
7
+ import re
8
+
9
+
10
+ class LanguageDetector:
11
+ """Language detection using character n-gram frequencies."""
12
+
13
+ def __init__(self):
14
+ # Language profiles based on common character sequences
15
+ self.language_profiles = {
16
+ 'ENGLISH': {
17
+ 'chars': 'etaoinshrdlcumwfgypbvkjxqz',
18
+ 'ngrams': {'th', 'he', 'in', 'er', 'an', 're', 'on', 'at', 'en', 'nd',
19
+ 'ti', 'es', 'or', 'te', 'of', 'ed', 'is', 'it', 'al', 'ar',
20
+ 'st', 'to', 'nt', 'ng', 'se', 'ha', 'as', 'ou', 'io', 'le'},
21
+ 'words': {'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
22
+ 'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
23
+ 'do', 'at'}
24
+ },
25
+ 'SPANISH': {
26
+ 'chars': 'eaosrnidlctumpbgvyqhfzjñxwk',
27
+ 'ngrams': {'de', 'en', 'el', 'la', 'os', 'es', 'as', 'ar', 'er', 'ra',
28
+ 'al', 'an', 'do', 'or', 'ta', 'ue', 'io', 'on', 'ro', 'ad',
29
+ 'te', 'co', 'st', 'ci', 'nt', 'to', 'lo', 'no', 'po', 'ac'},
30
+ 'words': {'de', 'la', 'que', 'el', 'en', 'y', 'a', 'los', 'se', 'del',
31
+ 'las', 'un', 'por', 'con', 'no', 'una', 'su', 'para', 'es',
32
+ 'al'}
33
+ },
34
+ 'FRENCH': {
35
+ 'chars': 'esaitnrulodcpmévqfbghàjxèêyçwzùâîôûëïüœ',
36
+ 'ngrams': {'es', 'le', 'en', 'de', 'nt', 'on', 're', 'er', 'ai', 'te',
37
+ 'la', 'an', 'ou', 'it', 'ur', 'et', 'el', 'se', 'qu', 'me',
38
+ 'is', 'ar', 'ce', 'ns', 'us', 'ue', 'ss', 'ie', 'em', 'tr'},
39
+ 'words': {'le', 'de', 'un', 'être', 'et', 'à', 'il', 'avoir', 'ne',
40
+ 'je', 'son', 'que', 'se', 'qui', 'ce', 'dans', 'en', 'du',
41
+ 'elle', 'au'}
42
+ },
43
+ 'GERMAN': {
44
+ 'chars': 'enisratdhulcgmobwfkzvüpäößjyqxéèêëàáâãåāăąćčĉċďđ',
45
+ 'ngrams': {'en', 'er', 'ch', 'de', 'ei', 'in', 'te', 'nd', 'ie', 'ge',
46
+ 'st', 'ne', 'be', 'es', 'un', 'zu', 'an', 'ng', 'au', 'it',
47
+ 'is', 'he', 'ht', 'se', 'ck', 'ic', 're', 'ns', 'sc', 'tz'},
48
+ 'words': {'der', 'die', 'und', 'in', 'den', 'von', 'zu', 'das', 'mit',
49
+ 'sich', 'des', 'auf', 'für', 'ist', 'im', 'dem', 'nicht',
50
+ 'ein', 'eine', 'als'}
51
+ }
52
+ }
53
+
54
+ # Compile word patterns
55
+ self.word_pattern = re.compile(r'\b\w+\b')
56
+
57
+ def _extract_ngrams(self, text: str, n: int = 2) -> List[str]:
58
+ """Extract character n-grams from text."""
59
+ text = text.lower()
60
+ return [text[i:i+n] for i in range(len(text)-n+1)]
61
+
62
+ def _calculate_char_frequencies(self, text: str) -> Dict[str, float]:
63
+ """Calculate character frequencies in text."""
64
+ text = text.lower()
65
+ char_count = Counter(c for c in text if c.isalpha())
66
+ total = sum(char_count.values()) or 1
67
+ return {char: count/total for char, count in char_count.items()}
68
+
69
+ def _calculate_ngram_frequencies(self, text: str) -> Dict[str, float]:
70
+ """Calculate n-gram frequencies in text."""
71
+ ngrams = self._extract_ngrams(text)
72
+ ngram_count = Counter(ngrams)
73
+ total = sum(ngram_count.values()) or 1
74
+ return {ngram: count/total for ngram, count in ngram_count.items()}
75
+
76
+ def _calculate_word_frequencies(self, text: str) -> Dict[str, float]:
77
+ """Calculate word frequencies in text."""
78
+ words = self.word_pattern.findall(text.lower())
79
+ word_count = Counter(words)
80
+ total = sum(word_count.values()) or 1
81
+ return {word: count/total for word, count in word_count.items()}
82
+
83
+ def _calculate_similarity(self, freq1: Dict[str, float], freq2: Dict[str, float]) -> float:
84
+ """Calculate similarity between two frequency distributions."""
85
+ common_keys = set(freq1.keys()) & set(freq2.keys())
86
+ if not common_keys:
87
+ return 0.0
88
+
89
+ similarity = sum(min(freq1.get(k, 0), freq2.get(k, 0)) for k in common_keys)
90
+ return similarity
91
+
92
+ def detect(self, text: str) -> List[Tuple[str, float]]:
93
+ """
94
+ Detect the language of text with confidence scores.
95
+
96
+ Returns:
97
+ List of (language, confidence) tuples, sorted by confidence
98
+ """
99
+ if not text:
100
+ return []
101
+
102
+ # Calculate frequencies for input text
103
+ char_freqs = self._calculate_char_frequencies(text)
104
+ ngram_freqs = self._calculate_ngram_frequencies(text)
105
+ word_freqs = self._calculate_word_frequencies(text)
106
+
107
+ # Calculate similarity scores for each language
108
+ scores = []
109
+ for lang, profile in self.language_profiles.items():
110
+ # Character similarity
111
+ char_sim = sum(char_freqs.get(c, 0) for c in profile['chars'])
112
+
113
+ # N-gram similarity
114
+ ngram_sim = sum(ngram_freqs.get(ng, 0) for ng in profile['ngrams'])
115
+
116
+ # Word similarity
117
+ word_sim = sum(word_freqs.get(w, 0) for w in profile['words'])
118
+
119
+ # Combined score (weighted average)
120
+ total_score = (0.3 * char_sim + 0.4 * ngram_sim + 0.3 * word_sim)
121
+ scores.append((lang, total_score))
122
+
123
+ # Normalize scores
124
+ total = sum(score for _, score in scores) or 1
125
+ normalized_scores = [(lang, score/total) for lang, score in scores]
126
+
127
+ # Sort by confidence
128
+ return sorted(normalized_scores, key=lambda x: x[1], reverse=True)
webstoken/ner.py CHANGED
@@ -1,164 +1,164 @@
1
- """
2
- Named Entity Recognition (NER) module for identifying and classifying named entities.
3
- """
4
-
5
- from typing import List, Tuple, Dict, Set
6
- import re
7
-
8
-
9
- class NamedEntityRecognizer:
10
- """Rule-based Named Entity Recognition."""
11
-
12
- def __init__(self):
13
- # Common entity patterns
14
- self.PERSON_TITLES = {
15
- 'mr', 'mrs', 'ms', 'miss', 'dr', 'prof', 'sir', 'madam',
16
- 'lord', 'lady', 'president', 'ceo', 'director'
17
- }
18
-
19
- self.ORGANIZATION_SUFFIXES = {
20
- 'inc', 'corp', 'ltd', 'llc', 'company', 'corporation',
21
- 'associates', 'partners', 'foundation', 'institute'
22
- }
23
-
24
- self.LOCATION_INDICATORS = {
25
- 'street', 'road', 'avenue', 'boulevard', 'lane', 'drive',
26
- 'circle', 'square', 'park', 'bridge', 'river', 'lake',
27
- 'mountain', 'forest', 'city', 'town', 'village', 'country'
28
- }
29
-
30
- self.DATE_MONTHS = {
31
- 'january', 'february', 'march', 'april', 'may', 'june',
32
- 'july', 'august', 'september', 'october', 'november', 'december'
33
- }
34
-
35
- # Compile regex patterns
36
- self.patterns = {
37
- 'EMAIL': re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b'),
38
- 'URL': re.compile(r'https?://(?:[\w-]|(?:%[\da-fA-F]{2}))+'),
39
- 'PHONE': re.compile(r'\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
40
- 'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'),
41
- 'TIME': re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AaPp][Mm])?\b'),
42
- 'MONEY': re.compile(r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:dollars|USD|EUR|GBP)'),
43
- 'PERCENTAGE': re.compile(r'\b\d+(?:\.\d+)?%\b')
44
- }
45
-
46
- def is_capitalized(self, word: str) -> bool:
47
- """Check if a word is capitalized."""
48
- return word and word[0].isupper()
49
-
50
- def extract_entities(self, text: str) -> Dict[str, List[Tuple[str, str]]]:
51
- """
52
- Extract named entities from text.
53
-
54
- Returns:
55
- Dict mapping entity types to list of (text, label) tuples
56
- """
57
- entities = {
58
- 'PERSON': [],
59
- 'ORGANIZATION': [],
60
- 'LOCATION': [],
61
- 'DATE': [],
62
- 'TIME': [],
63
- 'MONEY': [],
64
- 'EMAIL': [],
65
- 'URL': [],
66
- 'PHONE': [],
67
- 'PERCENTAGE': []
68
- }
69
-
70
- # First find regex pattern matches
71
- for label, pattern in self.patterns.items():
72
- for match in pattern.finditer(text):
73
- entities[label].append((match.group(), label))
74
-
75
- # Process text word by word for other entities
76
- words = text.split()
77
- i = 0
78
- while i < len(words):
79
- word = words[i]
80
- next_word = words[i + 1] if i + 1 < len(words) else None
81
-
82
- # Check for person names
83
- if word.lower() in self.PERSON_TITLES and next_word and self.is_capitalized(next_word):
84
- name_parts = []
85
- j = i + 1
86
- while j < len(words) and self.is_capitalized(words[j]):
87
- name_parts.append(words[j])
88
- j += 1
89
- if name_parts:
90
- entities['PERSON'].append((' '.join(name_parts), 'PERSON'))
91
- i = j
92
- continue
93
-
94
- # Check for organizations
95
- if self.is_capitalized(word):
96
- org_parts = [word]
97
- j = i + 1
98
- while j < len(words) and (
99
- self.is_capitalized(words[j]) or
100
- words[j].lower() in self.ORGANIZATION_SUFFIXES
101
- ):
102
- org_parts.append(words[j])
103
- j += 1
104
- if len(org_parts) > 1 or (
105
- len(org_parts) == 1 and
106
- any(suff in word.lower() for suff in self.ORGANIZATION_SUFFIXES)
107
- ):
108
- entities['ORGANIZATION'].append((' '.join(org_parts), 'ORGANIZATION'))
109
- i = j
110
- continue
111
-
112
- # Check for locations
113
- if word.lower() in self.LOCATION_INDICATORS and i > 0:
114
- if self.is_capitalized(words[i - 1]):
115
- entities['LOCATION'].append((words[i - 1] + ' ' + word, 'LOCATION'))
116
-
117
- i += 1
118
-
119
- return entities
120
-
121
- def tag_text(self, text: str) -> List[Tuple[str, str]]:
122
- """
123
- Tag each word in text with its entity type.
124
-
125
- Returns:
126
- List of (word, entity_type) tuples
127
- """
128
- entities = self.extract_entities(text)
129
- tagged = []
130
-
131
- # Create a map of word positions to entity labels
132
- position_labels = {}
133
- text_lower = text.lower()
134
-
135
- for entity_type, entity_list in entities.items():
136
- for entity_text, _ in entity_list:
137
- start = text_lower.find(entity_text.lower())
138
- if start != -1:
139
- end = start + len(entity_text)
140
- for pos in range(start, end):
141
- position_labels[pos] = entity_type
142
-
143
- # Tag each character position
144
- current_pos = 0
145
- current_word = []
146
- current_label = 'O' # Outside any entity
147
-
148
- for char in text:
149
- if char.isspace():
150
- if current_word:
151
- tagged.append((''.join(current_word), current_label))
152
- current_word = []
153
- current_label = 'O'
154
- else:
155
- current_word.append(char)
156
- if current_pos in position_labels:
157
- current_label = position_labels[current_pos]
158
- current_pos += 1
159
-
160
- # Add last word if exists
161
- if current_word:
162
- tagged.append((''.join(current_word), current_label))
163
-
164
- return tagged
1
+ """
2
+ Named Entity Recognition (NER) module for identifying and classifying named entities.
3
+ """
4
+
5
+ from typing import List, Tuple, Dict, Set
6
+ import re
7
+
8
+
9
+ class NamedEntityRecognizer:
10
+ """Rule-based Named Entity Recognition."""
11
+
12
+ def __init__(self):
13
+ # Common entity patterns
14
+ self.PERSON_TITLES = {
15
+ 'mr', 'mrs', 'ms', 'miss', 'dr', 'prof', 'sir', 'madam',
16
+ 'lord', 'lady', 'president', 'ceo', 'director'
17
+ }
18
+
19
+ self.ORGANIZATION_SUFFIXES = {
20
+ 'inc', 'corp', 'ltd', 'llc', 'company', 'corporation',
21
+ 'associates', 'partners', 'foundation', 'institute'
22
+ }
23
+
24
+ self.LOCATION_INDICATORS = {
25
+ 'street', 'road', 'avenue', 'boulevard', 'lane', 'drive',
26
+ 'circle', 'square', 'park', 'bridge', 'river', 'lake',
27
+ 'mountain', 'forest', 'city', 'town', 'village', 'country'
28
+ }
29
+
30
+ self.DATE_MONTHS = {
31
+ 'january', 'february', 'march', 'april', 'may', 'june',
32
+ 'july', 'august', 'september', 'october', 'november', 'december'
33
+ }
34
+
35
+ # Compile regex patterns
36
+ self.patterns = {
37
+ 'EMAIL': re.compile(r'\b[\w\.-]+@[\w\.-]+\.\w+\b'),
38
+ 'URL': re.compile(r'https?://(?:[\w-]|(?:%[\da-fA-F]{2}))+'),
39
+ 'PHONE': re.compile(r'\+?\d{1,3}[-.\s]?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'),
40
+ 'DATE': re.compile(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b|\b(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{1,2},? \d{4}\b'),
41
+ 'TIME': re.compile(r'\b\d{1,2}:\d{2}(?::\d{2})?(?:\s*[AaPp][Mm])?\b'),
42
+ 'MONEY': re.compile(r'\$\d+(?:,\d{3})*(?:\.\d{2})?|\d+(?:,\d{3})*(?:\.\d{2})?\s*(?:dollars|USD|EUR|GBP)'),
43
+ 'PERCENTAGE': re.compile(r'\b\d+(?:\.\d+)?%\b')
44
+ }
45
+
46
+ def is_capitalized(self, word: str) -> bool:
47
+ """Check if a word is capitalized."""
48
+ return word and word[0].isupper()
49
+
50
+ def extract_entities(self, text: str) -> Dict[str, List[Tuple[str, str]]]:
51
+ """
52
+ Extract named entities from text.
53
+
54
+ Returns:
55
+ Dict mapping entity types to list of (text, label) tuples
56
+ """
57
+ entities = {
58
+ 'PERSON': [],
59
+ 'ORGANIZATION': [],
60
+ 'LOCATION': [],
61
+ 'DATE': [],
62
+ 'TIME': [],
63
+ 'MONEY': [],
64
+ 'EMAIL': [],
65
+ 'URL': [],
66
+ 'PHONE': [],
67
+ 'PERCENTAGE': []
68
+ }
69
+
70
+ # First find regex pattern matches
71
+ for label, pattern in self.patterns.items():
72
+ for match in pattern.finditer(text):
73
+ entities[label].append((match.group(), label))
74
+
75
+ # Process text word by word for other entities
76
+ words = text.split()
77
+ i = 0
78
+ while i < len(words):
79
+ word = words[i]
80
+ next_word = words[i + 1] if i + 1 < len(words) else None
81
+
82
+ # Check for person names
83
+ if word.lower() in self.PERSON_TITLES and next_word and self.is_capitalized(next_word):
84
+ name_parts = []
85
+ j = i + 1
86
+ while j < len(words) and self.is_capitalized(words[j]):
87
+ name_parts.append(words[j])
88
+ j += 1
89
+ if name_parts:
90
+ entities['PERSON'].append((' '.join(name_parts), 'PERSON'))
91
+ i = j
92
+ continue
93
+
94
+ # Check for organizations
95
+ if self.is_capitalized(word):
96
+ org_parts = [word]
97
+ j = i + 1
98
+ while j < len(words) and (
99
+ self.is_capitalized(words[j]) or
100
+ words[j].lower() in self.ORGANIZATION_SUFFIXES
101
+ ):
102
+ org_parts.append(words[j])
103
+ j += 1
104
+ if len(org_parts) > 1 or (
105
+ len(org_parts) == 1 and
106
+ any(suff in word.lower() for suff in self.ORGANIZATION_SUFFIXES)
107
+ ):
108
+ entities['ORGANIZATION'].append((' '.join(org_parts), 'ORGANIZATION'))
109
+ i = j
110
+ continue
111
+
112
+ # Check for locations
113
+ if word.lower() in self.LOCATION_INDICATORS and i > 0:
114
+ if self.is_capitalized(words[i - 1]):
115
+ entities['LOCATION'].append((words[i - 1] + ' ' + word, 'LOCATION'))
116
+
117
+ i += 1
118
+
119
+ return entities
120
+
121
+ def tag_text(self, text: str) -> List[Tuple[str, str]]:
122
+ """
123
+ Tag each word in text with its entity type.
124
+
125
+ Returns:
126
+ List of (word, entity_type) tuples
127
+ """
128
+ entities = self.extract_entities(text)
129
+ tagged = []
130
+
131
+ # Create a map of word positions to entity labels
132
+ position_labels = {}
133
+ text_lower = text.lower()
134
+
135
+ for entity_type, entity_list in entities.items():
136
+ for entity_text, _ in entity_list:
137
+ start = text_lower.find(entity_text.lower())
138
+ if start != -1:
139
+ end = start + len(entity_text)
140
+ for pos in range(start, end):
141
+ position_labels[pos] = entity_type
142
+
143
+ # Tag each character position
144
+ current_pos = 0
145
+ current_word = []
146
+ current_label = 'O' # Outside any entity
147
+
148
+ for char in text:
149
+ if char.isspace():
150
+ if current_word:
151
+ tagged.append((''.join(current_word), current_label))
152
+ current_word = []
153
+ current_label = 'O'
154
+ else:
155
+ current_word.append(char)
156
+ if current_pos in position_labels:
157
+ current_label = position_labels[current_pos]
158
+ current_pos += 1
159
+
160
+ # Add last word if exists
161
+ if current_word:
162
+ tagged.append((''.join(current_word), current_label))
163
+
164
+ return tagged