nlp-package-mitb-2025 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1 @@
1
+ from .nlp import NLP_MITB_2025
nlp_mitb_2025/nlp.py ADDED
@@ -0,0 +1,149 @@
1
+ import re
2
+ import random
3
+ from collections import Counter
4
+ from nltk.util import ngrams
5
+ from nltk.tokenize import word_tokenize
6
+ import nltk
7
+ from nltk.stem import WordNetLemmatizer, PorterStemmer
8
+ from nltk.tokenize import word_tokenize
9
+ from nltk.corpus import wordnet
10
+
11
+
12
# Fetch the NLTK data bundles this module depends on (tokenizers, WordNet,
# the POS tagger, multilingual WordNet, and stopword lists).  Runs at import
# time; nltk.download() is a no-op when the data is already present locally.
for _resource in ('punkt', 'wordnet', 'averaged_perceptron_tagger',
                  'omw-1.4', 'stopwords'):
    nltk.download(_resource)
17
+
18
+ class NLP_MITB_2025:
19
+
20
+
21
def split_text(self, text):
    """Tokenize *text* into alphanumeric word chunks.

    Returns a list of maximal runs of word characters; punctuation and
    whitespace are dropped.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return word_pattern.findall(text)
23
+
24
def extract_dates(self, text):
    """Return all DD/MM/YYYY or DD-MM-YYYY style dates found in *text*."""
    slash_dates = r'\b\d{2}/\d{2}/\d{4}\b'
    dash_dates = r'\b\d{2}-\d{2}-\d{4}\b'
    return re.findall(slash_dates + '|' + dash_dates, text)
26
+
27
def extract_phones(self, text):
    """Return phone numbers found in *text*.

    Recognized formats:
      * +91-XXXXXXXXXX   (Indian number with country code)
      * XXX-XXX-XXXX     (dashed, US-style)
      * (XXX) XXX-XXXX   (parenthesized area code)

    Bug fix: the original pattern had a stray trailing space after the
    parenthesized alternative, so "(123) 456-7890" only matched when it
    was followed by a space — and the space was included in the match.
    """
    pattern = r'\+91-\d{10}|\d{3}-\d{3}-\d{4}|\(\d{3}\)\s\d{3}-\d{4}'
    return re.findall(pattern, text)
29
+
30
def clean_text(self, text):
    """Strip leading and trailing non-word characters from *text*."""
    edge_junk = re.compile(r'^\W+|\W+$')
    return edge_junk.sub('', text)
32
+
33
def count_non_alnum(self, text):
    """Count the characters in *text* that are not word characters."""
    return sum(1 for _ in re.finditer(r'\W', text))
35
+
36
def replace_non_alnum(self, s, ch):
    """Replace every non-word character in *s* with the string *ch*."""
    non_word = re.compile(r'\W')
    return non_word.sub(ch, s)
38
+
39
def split_pairs(self, word):
    """Return every (prefix, suffix) split of *word* with both halves non-empty."""
    pairs = []
    for cut in range(1, len(word)):
        pairs.append((word[:cut], word[cut:]))
    return pairs
41
+
42
def prefixes_suffixes(self, word):
    """Return (prefixes, suffixes) for *word*.

    prefixes: word[:1], word[:2], ..., word  (all non-empty prefixes)
    suffixes: word, word[1:], ..., word[-1:] (all non-empty suffixes)
    """
    prefixes = []
    suffixes = []
    for i in range(len(word)):
        prefixes.append(word[:i + 1])
        suffixes.append(word[i:])
    return prefixes, suffixes
44
+
45
def random_split(self, word):
    """Split *word* at a uniformly random interior position.

    Returns a (head, tail) pair; both parts are non-empty whenever
    len(word) >= 2.

    Bug fix: for empty or single-character words the original call
    random.randint(1, len(word) - 1) raised ValueError (empty range);
    those inputs now return the degenerate split (word, '').
    """
    if len(word) < 2:
        # No interior cut point exists — return the trivial split.
        return (word, '')
    cut = random.randint(1, len(word) - 1)
    return (word[:cut], word[cut:])
48
+
49
def ngram_frequencies(text, n):
    """Count the n-grams of whitespace-split *text* (Counter of tuples).

    NOTE(review): defined without ``self`` inside the class, so instance
    calls would pass the instance as *text* — confirm the intended call
    style (class-level vs. instance-level).
    """
    tokens = text.split()
    return Counter(ngrams(tokens, n))
51
+
52
def ngram_probabilities(text, n):
    """Return the relative frequency of each n-gram of *text*.

    Splits on whitespace, counts the n-grams, and maps each n-gram
    tuple to count / total.

    Bug fix: when the text yields no n-grams (empty text, or n larger
    than the token count) the original divided by zero; an empty dict
    is now returned instead.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    words = text.split()
    ngram_counts = Counter(ngrams(words, n))
    total_ngrams = sum(ngram_counts.values())
    if total_ngrams == 0:
        # No n-grams means no distribution to report.
        return {}
    return {gram: count / total_ngrams for gram, count in ngram_counts.items()}
57
+
58
def reverse_ngrams(text, n):
    """Return the n-grams of *text* with the token order reversed.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    reversed_tokens = list(reversed(text.split()))
    return list(ngrams(reversed_tokens, n))
60
+
61
def remove_digits(sentence):
    """Rebuild *sentence* with purely numeric tokens removed.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    kept = [token for token in word_tokenize(sentence) if not token.isdigit()]
    return ' '.join(kept)
63
+
64
def count_digits(sentence):
    """Count the digit characters contained in purely numeric tokens.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    total = 0
    for token in word_tokenize(sentence):
        if token.isdigit():
            total += len(token)
    return total
66
+
67
def extract_digits(sentence):
    """Return the individual digit characters from numeric tokens.

    Tokenizes *sentence*, keeps tokens that consist solely of digits,
    and flattens them into a list of single-character strings.

    Bug fix: the original printed the list and implicitly returned
    None, making the result unusable by callers; the list is now
    returned instead.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    digits = []
    for token in word_tokenize(sentence):
        if token.isdigit():
            digits.extend(token)
    return digits
74
+
75
def custom_tokenizer(sentence):
    """Tokenize *sentence* while keeping dates and e-mail addresses intact.

    Date-like tokens (D/M/YY through DD/MM/YYYY) and e-mail addresses
    are extracted first and prepended to the ordinary word tokenization
    of the remaining text.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    pattern = r'\d{1,2}/\d{1,2}/\d{2,4}|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    special_tokens = re.findall(pattern, sentence)
    leftover = re.sub(pattern, ' ', sentence)
    return special_tokens + word_tokenize(leftover)
80
+
81
def clean_tweet(text):
    """Lower-case *text* after blanking hashtags, punctuation and space runs.

    Each hashtag, non-word/non-space character, or run of spaces is
    replaced by a single space, then the result is stripped and
    lower-cased.  Interior whitespace is deliberately NOT re-collapsed,
    matching the original behavior.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    scrubber = re.compile(r'#\w+|[^\w\s]| +')
    return scrubber.sub(' ', text).strip().lower()
84
+
85
def remove_emojis(text):
    """Strip common emoji / pictograph codepoints from *text*.

    The character class below is assembled from adjacent literals so the
    final pattern string is identical to the original.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed characters
    )
    emoji_pattern = re.compile("[" + emoji_ranges + "]+", flags=re.UNICODE)
    return emoji_pattern.sub('', text)
99
+
100
def normalize(text):
    """Collapse all whitespace runs to single spaces and lower-case *text*.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    tokens = text.lower().split()
    return ' '.join(tokens)
102
+
103
+
104
def extract_dates(text):
    """Return date strings found in *text*.

    Matches three formats: DD/MM/YYYY, DD-MM-YYYY, and
    'Month DD, YYYY' with a full English month name.

    NOTE(review): this second ``extract_dates`` (defined without
    ``self``) shadows the earlier two-argument method of the same name
    on the class — confirm which one callers actually rely on.
    """
    patterns = [
        r'\b\d{2}/\d{2}/\d{4}\b',   # 'DD/MM/YYYY'
        r'\b\d{2}-\d{2}-\d{4}\b',   # 'MM-DD-YYYY'
        r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{1,2},\s\d{4}\b',  # 'Month DD, YYYY'
    ]
    return re.findall('|'.join(patterns), text)
114
+
115
def extract_and_standardize(text):
    """Find 10-digit mobile numbers in *text* and normalize to +91-XXXXXXXXXX.

    Accepts numbers with an optional +91 / 91 / (+91) / 0 prefix and an
    optional space or dash between the two 5-digit halves.

    Bug fix: removed a leftover debug ``print(matches)`` that dumped the
    raw regex capture groups to stdout on every call.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    pattern = r'(\+91[\-\s]?|91[\-\s]?|\(?\+91\)?[\s\-]?|0)?(\d{5})[\s\-]?(\d{5})'
    matches = re.findall(pattern, text)
    return [f'+91-{m[1]}{m[2]}' for m in matches]
120
+
121
+
122
def process_text(text):
    """Tokenize *text* and report stemmed and lemmatized forms.

    Returns a dict with keys 'Original', 'Stemming', 'Lemmatization',
    each holding a list parallel to the word_tokenize() output.

    NOTE(review): defined without ``self`` inside the class — confirm
    the intended call style.
    """
    tokens = word_tokenize(text)

    # One stemmer/lemmatizer pair per call, as in the original.
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    result = {'Original': tokens, 'Stemming': [], 'Lemmatization': []}
    for token in tokens:
        result['Stemming'].append(stemmer.stem(token))
        result['Lemmatization'].append(lemmatizer.lemmatize(token))
    return result
142
+
143
+
144
+
145
+
146
+
147
+
148
+
149
+
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.2
2
+ Name: nlp_package_mitb_2025
3
+ Version: 0.1.0
4
+ Summary: A Python package for various NLP utilities.
5
+ Author: Your Name
6
+ Author-email: your.email@example.com
7
+ Classifier: Programming Language :: Python :: 3
8
+ Classifier: Operating System :: OS Independent
9
+ Requires-Python: >=3.6
10
+ Requires-Dist: nltk>=3.6.0
11
+ Dynamic: author
12
+ Dynamic: author-email
13
+ Dynamic: classifier
14
+ Dynamic: requires-dist
15
+ Dynamic: requires-python
16
+ Dynamic: summary
@@ -0,0 +1,6 @@
1
+ nlp_mitb_2025/__init__.py,sha256=-3NjWqy8N1Doyv31XApU3yCM9l9GICxHFTbJA9dNeLo,30
2
+ nlp_mitb_2025/nlp.py,sha256=kWli7B9nIphd_-sTX8PqLB1KLHOyKZDEI095bg5uDdI,4762
3
+ nlp_package_mitb_2025-0.1.0.dist-info/METADATA,sha256=Y18KTdxu85Th-QtX-YvPaWRWutfrMlHKu7u9QqxREkU,457
4
+ nlp_package_mitb_2025-0.1.0.dist-info/WHEEL,sha256=HiCZjzuy6Dw0hdX5R3LCFPDmFS4BWl8H-8W39XfmgX4,91
5
+ nlp_package_mitb_2025-0.1.0.dist-info/top_level.txt,sha256=WdE25aXak4L0BaO16g78NfYGxXSWz4PuHxZM5NyvH0Q,14
6
+ nlp_package_mitb_2025-0.1.0.dist-info/RECORD,,
@@ -0,0 +1,5 @@
1
+ Wheel-Version: 1.0
2
+ Generator: setuptools (72.2.0)
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any
5
+
@@ -0,0 +1 @@
1
+ nlp_mitb_2025