nlp_package_mitb_2025-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
nlp_mitb_2025/__init__.py
ADDED
@@ -0,0 +1 @@
+from .nlp import NLP_MITB_2025
nlp_mitb_2025/nlp.py
ADDED
@@ -0,0 +1,147 @@
+import re
+import random
+from collections import Counter
+
+import nltk
+from nltk.util import ngrams
+from nltk.tokenize import word_tokenize
+from nltk.stem import WordNetLemmatizer, PorterStemmer
+from nltk.corpus import wordnet
+
+# Download the NLTK data needed by the tokenizer, tagger, and lemmatizer.
+nltk.download('punkt')
+nltk.download('wordnet')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('omw-1.4')
+nltk.download('stopwords')
+
+
+class NLP_MITB_2025:
+
+    def split_text(self, text):
+        # Split text into alphanumeric word tokens.
+        return re.findall(r'\b\w+\b', text)
+
+    def extract_dates(self, text):
+        # Match 'DD/MM/YYYY' or 'DD-MM-YYYY' style dates.
+        return re.findall(r'\b\d{2}/\d{2}/\d{4}\b|\b\d{2}-\d{2}-\d{4}\b', text)
+
+    def extract_phones(self, text):
+        # Match '+91-XXXXXXXXXX', 'XXX-XXX-XXXX', or '(XXX) XXX-XXXX'.
+        return re.findall(r'\+91-\d{10}|\d{3}-\d{3}-\d{4}|\(\d{3}\)\s\d{3}-\d{4}', text)
+
+    def clean_text(self, text):
+        # Strip leading and trailing non-word characters.
+        return re.sub(r'^\W+|\W+$', '', text)
+
+    def count_non_alnum(self, text):
+        return len(re.findall(r'\W', text))
+
+    def replace_non_alnum(self, s, ch):
+        return re.sub(r'\W', ch, s)
+
+    def split_pairs(self, word):
+        # All (prefix, suffix) splits of a word at interior positions.
+        return [(word[:i], word[i:]) for i in range(1, len(word))]
+
+    def prefixes_suffixes(self, word):
+        return [word[:i] for i in range(1, len(word)+1)], [word[i:] for i in range(len(word))]
+
+    def random_split(self, word):
+        # Assumes len(word) >= 2.
+        i = random.randint(1, len(word)-1)
+        return (word[:i], word[i:])
+
+    def ngram_frequencies(self, text, n):
+        return Counter(ngrams(text.split(), n))
+
+    def ngram_probabilities(self, text, n):
+        # Relative frequency of each n-gram among all n-grams in the text.
+        words = text.split()
+        ngram_counts = Counter(ngrams(words, n))
+        total_ngrams = sum(ngram_counts.values())
+        return {ngram: count / total_ngrams for ngram, count in ngram_counts.items()}
+
+    def reverse_ngrams(self, text, n):
+        return list(ngrams(text.split()[::-1], n))
+
+    def remove_digits(self, sentence):
+        return ' '.join(w for w in word_tokenize(sentence) if not w.isdigit())
+
+    def count_digits(self, sentence):
+        return sum(len(w) for w in word_tokenize(sentence) if w.isdigit())
+
+    def extract_digits(self, sentence):
+        words = word_tokenize(sentence)
+        ans = []
+        for w in words:
+            if w.isdigit():
+                ans.extend(list(w))
+        return ans
+
+    def custom_tokenizer(self, sentence):
+        # Pull out dates and email addresses first, then tokenize the rest.
+        pattern = r'\d{1,2}/\d{1,2}/\d{2,4}|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
+        special_tokens = re.findall(pattern, sentence)
+        remaining_text = re.sub(pattern, ' ', sentence)
+        return special_tokens + word_tokenize(remaining_text)
+
+    def clean_tweet(self, text):
+        # Drop hashtags and punctuation, collapse whitespace, lowercase.
+        cleaned = re.sub(r'#\w+|[^\w\s]| +', ' ', text).strip().lower()
+        return cleaned
+
+    def remove_emojis(self, text):
+        emoji_pattern = re.compile(
+            "["
+            "\U0001F600-\U0001F64F"  # Emoticons
+            "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
+            "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
+            "\U0001F1E0-\U0001F1FF"  # Flags
+            "\U00002702-\U000027B0"  # Dingbats
+            "\U000024C2-\U0001F251"  # Enclosed characters
+            "]+",
+            flags=re.UNICODE
+        )
+        return emoji_pattern.sub(r'', text)
+
+    def normalize(self, text):
+        return ' '.join(text.lower().split())
+
+    def extract_dates(self, text):
+        # Note: redefines the simpler extract_dates above; this definition wins.
+
+        # Regular expression pattern to match different date formats
+        pattern = r'\b\d{2}/\d{2}/\d{4}\b'  # Matches 'DD/MM/YYYY'
+        pattern += r'|\b\d{2}-\d{2}-\d{4}\b'  # Matches 'DD-MM-YYYY'
+        pattern += r'|\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\s\d{4}\b'  # Matches 'Month DD, YYYY'
+
+        # Find all matches in the text
+        dates = re.findall(pattern, text)
+        return dates
+
+    def extract_and_standardize(self, text):
+        # Normalize Indian mobile numbers to '+91-XXXXXXXXXX'.
+        pattern = r'(\+91[\-\s]?|91[\-\s]?|\(?\+91\)?[\s\-]?|0)?(\d{5})[\s\-]?(\d{5})'
+        matches = re.findall(pattern, text)
+        return [f'+91-{m[1]}{m[2]}' for m in matches]
+
+    def process_text(self, text):
+        # Tokenize the text
+        tokens = word_tokenize(text)
+
+        # Initialize stemmer and lemmatizer
+        stemmer = PorterStemmer()
+        lemmatizer = WordNetLemmatizer()
+
+        # Apply stemming
+        stems = [stemmer.stem(word) for word in tokens]
+
+        # Apply lemmatization
+        lemmas = [lemmatizer.lemmatize(word) for word in tokens]
+
+        return {
+            'Original': tokens,
+            'Stemming': stems,
+            'Lemmatization': lemmas
+        }
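For orientation, a minimal usage sketch of the class above. This is a sketch only: it assumes the wheel is installed and NLTK can fetch its data on import, and the sample strings and expected values in the comments are illustrative, not taken from the package.

    from nlp_mitb_2025 import NLP_MITB_2025

    nlp = NLP_MITB_2025()
    text = "Meet me on 12/05/2024, or write to a.b@example.com."

    print(nlp.extract_dates(text))     # ['12/05/2024']
    print(nlp.custom_tokenizer(text))  # date and email survive as single tokens
    print(nlp.ngram_probabilities("the cat sat on the mat", 2))  # 5 bigrams, 0.2 each
    print(nlp.process_text("running flies easily"))  # tokens, stems, lemmas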
nlp_package_mitb_2025-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,16 @@
+Metadata-Version: 2.2
+Name: nlp_package_mitb_2025
+Version: 0.1.0
+Summary: A Python package for various NLP utilities.
+Author: Your Name
+Author-email: your.email@example.com
+Classifier: Programming Language :: Python :: 3
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.6
+Requires-Dist: nltk>=3.6.0
+Dynamic: author
+Dynamic: author-email
+Dynamic: classifier
+Dynamic: requires-dist
+Dynamic: requires-python
+Dynamic: summary
nlp_package_mitb_2025-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+nlp_mitb_2025/__init__.py,sha256=-3NjWqy8N1Doyv31XApU3yCM9l9GICxHFTbJA9dNeLo,30
+nlp_mitb_2025/nlp.py,sha256=kWli7B9nIphd_-sTX8PqLB1KLHOyKZDEI095bg5uDdI,4762
+nlp_package_mitb_2025-0.1.0.dist-info/METADATA,sha256=Y18KTdxu85Th-QtX-YvPaWRWutfrMlHKu7u9QqxREkU,457
+nlp_package_mitb_2025-0.1.0.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
+nlp_package_mitb_2025-0.1.0.dist-info/top_level.txt,sha256=WdE25aXak4L0BaO16g78NfYGxXSWz4PuHxZM5NyvH0Q,14
+nlp_package_mitb_2025-0.1.0.dist-info/RECORD,,
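Each RECORD entry has the form path,sha256=<urlsafe-base64 digest, no padding>,<size in bytes>. A quick sketch of how such a line can be recomputed: the literal below is the __init__.py content from the first hunk, whose 30 bytes (no trailing newline) match the size column; whether the digest reproduces exactly depends on the file's exact bytes.

    import base64
    import hashlib

    # Content of nlp_mitb_2025/__init__.py as shown in the diff above.
    data = b"from .nlp import NLP_MITB_2025"

    # Wheel RECORD digests are urlsafe base64 of the raw sha256, padding stripped.
    digest = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).rstrip(b"=")
    print(f"nlp_mitb_2025/__init__.py,sha256={digest.decode()},{len(data)}")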
nlp_package_mitb_2025-0.1.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+nlp_mitb_2025