SqueakyCleanText 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) [2024] [Rehan Fazal]
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,2 @@
1
+ include README.md
2
+ include LICENSE
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.1
2
+ Name: SqueakyCleanText
3
+ Version: 0.1.0
4
+ Summary: A comprehensive text cleaning and preprocessing pipeline.
5
+ Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
+ Author: Rehan Fazal
7
+ License: MIT
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: lingua-language-detector
15
+ Requires-Dist: nltk
16
+ Requires-Dist: emoji
17
+ Requires-Dist: ftfy
18
+ Requires-Dist: Unidecode
19
+ Requires-Dist: beautifulsoup4
20
+ Requires-Dist: transformers
21
+ Requires-Dist: torch
22
+ Provides-Extra: dev
23
+ Requires-Dist: hypothesis; extra == "dev"
24
+ Requires-Dist: faker; extra == "dev"
25
+ Requires-Dist: flake8; extra == "dev"
26
+ Requires-Dist: pytest; extra == "dev"
27
+
28
+ # SqueakyCleanText
29
+ Clean your Text for Classical ML and Language Model
30
+
31
+ # TODO
32
+ - Ability to change the NER MODELS from the config file, which supports AutoModel and AutoTokenizer
33
+ - Add more language support in stopwords
@@ -0,0 +1,6 @@
1
+ # SqueakyCleanText
2
+ Clean your Text for Classical ML and Language Model
3
+
4
+ # TODO
5
+ - Ability to change the NER MODELS from the config file, which supports AutoModel and AutoTokenizer
6
+ - Add more language support in stopwords
@@ -0,0 +1,33 @@
1
+ Metadata-Version: 2.1
2
+ Name: SqueakyCleanText
3
+ Version: 0.1.0
4
+ Summary: A comprehensive text cleaning and preprocessing pipeline.
5
+ Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
+ Author: Rehan Fazal
7
+ License: MIT
8
+ Classifier: Programming Language :: Python :: 3
9
+ Classifier: License :: OSI Approved :: MIT License
10
+ Classifier: Operating System :: OS Independent
11
+ Requires-Python: >=3.7
12
+ Description-Content-Type: text/markdown
13
+ License-File: LICENSE
14
+ Requires-Dist: lingua-language-detector
15
+ Requires-Dist: nltk
16
+ Requires-Dist: emoji
17
+ Requires-Dist: ftfy
18
+ Requires-Dist: Unidecode
19
+ Requires-Dist: beautifulsoup4
20
+ Requires-Dist: transformers
21
+ Requires-Dist: torch
22
+ Provides-Extra: dev
23
+ Requires-Dist: hypothesis; extra == "dev"
24
+ Requires-Dist: faker; extra == "dev"
25
+ Requires-Dist: flake8; extra == "dev"
26
+ Requires-Dist: pytest; extra == "dev"
27
+
28
+ # SqueakyCleanText
29
+ Clean your Text for Classical ML and Language Model
30
+
31
+ # TODO
32
+ - Ability to change the NER MODELS from the config file, which supports AutoModel and AutoTokenizer
33
+ - Add more language support in stopwords
@@ -0,0 +1,26 @@
1
+ LICENSE
2
+ MANIFEST.in
3
+ README.md
4
+ setup.py
5
+ SqueakyCleanText.egg-info/PKG-INFO
6
+ SqueakyCleanText.egg-info/SOURCES.txt
7
+ SqueakyCleanText.egg-info/dependency_links.txt
8
+ SqueakyCleanText.egg-info/entry_points.txt
9
+ SqueakyCleanText.egg-info/requires.txt
10
+ SqueakyCleanText.egg-info/top_level.txt
11
+ sct/__init__.py
12
+ sct/config.py
13
+ sct/sct.py
14
+ sct/scripts/__init__.py
15
+ sct/scripts/download_nltk_stopwords.py
16
+ sct/utils/__init__.py
17
+ sct/utils/constants.py
18
+ sct/utils/contact.py
19
+ sct/utils/datetime.py
20
+ sct/utils/ner.py
21
+ sct/utils/normtext.py
22
+ sct/utils/resources.py
23
+ sct/utils/special.py
24
+ sct/utils/stopwords.py
25
+ tests/__init__.py
26
+ tests/test_sct.py
@@ -0,0 +1,2 @@
1
+ [console_scripts]
2
+ nltk_downloader = sct.scripts.download_nltk_stopwords:main
@@ -0,0 +1,14 @@
1
+ lingua-language-detector
2
+ nltk
3
+ emoji
4
+ ftfy
5
+ Unidecode
6
+ beautifulsoup4
7
+ transformers
8
+ torch
9
+
10
+ [dev]
11
+ hypothesis
12
+ faker
13
+ flake8
14
+ pytest
File without changes
@@ -0,0 +1,51 @@
1
+ """
2
+ detect_language : to detect the language automatically, but would consume more time if done on a batch
3
+ fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
4
+ to_ascii_unicode : if True, convert non-to_ascii characters into their closest to_ascii equivalents
5
+ replace_with_url : special URL token, default "",
6
+ replace_with_email : special EMAIL token, default "",
7
+ replace_years : replace year, default "",
8
+ replace_with_phone_number : special PHONE token, default "",
9
+ replace_with_number : special NUMBER token, default "",
10
+ no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
11
+ ner_process : To execute NER Process to remove the positpositional tags, PER, LOC, ORG, MISC
12
+ remove_isolated_letters : remove any isolated letters which doesn't add any value to the text
13
+ remove_isolated_symbols : remove any isolated symbols which shouldn't be present in the text, usually which isn't
14
+ immediatly prefixed and suffixed by letter or number
15
+ normalize_whitespace : remove any unnecessary whitespace
16
+ statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
17
+ casefold : to lower the text
18
+ remove_stopwords : remove stopwords based on the language, usues NLTK stopwords
19
+ remove_punctuation : removes all the special symbols
20
+ """
21
+
22
+ CHECK_DETECT_LANGUAGE = True
23
+ CHECK_FIX_BAD_UNICODE = True
24
+ CHECK_TO_ASCII_UNICODE = True
25
+ CHECK_REPLACE_HTML = True
26
+ CHECK_REPLACE_URLS = True
27
+ CHECK_REPLACE_EMAILS = True
28
+ CHECK_REPLACE_YEARS = True
29
+ CHECK_REPLACE_PHONE_NUMBERS = True
30
+ CHECK_REPLACE_NUMBERS = True
31
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
32
+ CHECK_NER_PROCESS = True
33
+ CHECK_REMOVE_CUSTOM_STOP_WORDS = True
34
+ CHECK_REMOVE_ISOLATED_LETTERS = True
35
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
36
+ CHECK_NORMALIZE_WHITESPACE = True
37
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
38
+ CHECK_CASEFOLD = True
39
+ CHECK_REMOVE_STOPWORDS = True
40
+ CHECK_REMOVE_PUNCTUATION = True
41
+ CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
42
+ REPLACE_WITH_URL = ""
43
+ REPLACE_WITH_HTML = ""
44
+ REPLACE_WITH_EMAIL = ""
45
+ REPLACE_WITH_YEARS = ""
46
+ REPLACE_WITH_PHONE_NUMBERS = ""
47
+ REPLACE_WITH_NUMBERS = ""
48
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
49
+ POSITIONAL_TAGS = ['PER', 'LOC']
50
+ NER_CONFIDENCE_THRESHOLD = 0.75
51
+ LANGUAGE = None
File without changes
@@ -0,0 +1,7 @@
1
import nltk

def main():
    """Download the NLTK 'stopwords' corpus required by sct.utils.stopwords."""
    nltk.download('stopwords')

if __name__ == "__main__":
    main()
@@ -0,0 +1,126 @@
1
+ """
2
+ This code provides a comprehensive text cleaning and preprocessing pipeline.
3
+ It includes functions to normalize, remove personal information and clean text data,
4
+ which is crucial for natural language processing tasks.
5
+ """
6
+ from sct import config
7
+ from sct.utils import contact, datetime, ner, normtext, resources, special, stopwords
8
+
9
class TextCleaner:
    """
    Configurable text cleaning pipeline.

    The steps are selected once, at construction time, from the boolean flags
    in ``sct.config``; ``process`` then applies them in order.

    ``process`` returns
      * ``(clean_text, statistical_text, language)`` when
        ``config.CHECK_STATISTICAL_MODEL_PROCESSING`` is enabled,
      * ``(clean_text, language)`` otherwise.
    NOTE(review): the variable return arity is part of the published interface
    and is therefore preserved.
    """

    def __init__(self):
        # One helper instance per cleaning concern.
        self.ProcessContacts = contact.ProcessContacts()
        self.ProcessDateTime = datetime.ProcessDateTime()
        self.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
        self.NormaliseText = normtext.NormaliseText()
        self.ProcessStopwords = stopwords.ProcessStopwords()
        self.GeneralNER = ner.GeneralNER()
        self.pipeline = []
        self.language = None  # set by detect_language() during process()
        self.init_pipeline()

    def init_pipeline(self):
        """Append the enabled cleaning steps, in their fixed order, to ``self.pipeline``."""
        # Language detection must run first: later steps (NER, stopwords)
        # consult ``self.language``.
        if config.CHECK_DETECT_LANGUAGE:
            self.pipeline.append(self.detect_language)

        # Unicode repair before any pattern matching.
        if config.CHECK_FIX_BAD_UNICODE:
            self.pipeline.append(self.fix_bad_unicode)
        if config.CHECK_TO_ASCII_UNICODE:
            self.pipeline.append(self.to_ascii_unicode)

        # PII / token replacements.
        if config.CHECK_REPLACE_HTML:
            self.pipeline.append(self.replace_html)
        if config.CHECK_REPLACE_URLS:
            self.pipeline.append(self.replace_urls)
        if config.CHECK_REPLACE_EMAILS:
            self.pipeline.append(self.replace_emails)
        if config.CHECK_REPLACE_YEARS:
            self.pipeline.append(self.replace_years)
        if config.CHECK_REPLACE_PHONE_NUMBERS:
            self.pipeline.append(self.replace_phone_numbers)
        if config.CHECK_REPLACE_NUMBERS:
            self.pipeline.append(self.replace_numbers)
        if config.CHECK_REPLACE_CURRENCY_SYMBOLS:
            self.pipeline.append(self.replace_currency_symbols)

        if config.CHECK_NER_PROCESS:
            self.pipeline.append(self.ner_process)

        # Final tidy-up.
        if config.CHECK_REMOVE_ISOLATED_LETTERS:
            self.pipeline.append(self.remove_isolated_letters)
        if config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS:
            self.pipeline.append(self.remove_isolated_special_symbols)
        if config.CHECK_NORMALIZE_WHITESPACE:
            self.pipeline.append(self.normalize_whitespace)

    def process(self, text):
        """Run the configured pipeline over *text* (coerced to str).

        Returns:
            ``(text, stext, language)`` or ``(text, language)`` — see class docstring.
        """
        text = str(text)  # tolerate non-string input

        for step in self.pipeline:
            text = step(text)

        if config.CHECK_STATISTICAL_MODEL_PROCESSING:
            stext = self.statistical_model_processing(text)
            return text, stext, self.language

        return text, self.language

    def detect_language(self, text):
        """Detect the language (e.g. 'ENGLISH') and cache it on ``self.language``."""
        self.language = str(resources.DETECTOR.detect_language_of(text)).split(".")[-1]
        return text

    def fix_bad_unicode(self, text):
        """Repair mojibake / garbled entities via ftfy."""
        return self.NormaliseText.fix_bad_unicode(text)

    def to_ascii_unicode(self, text):
        """Transliterate to the closest ASCII representation."""
        return self.NormaliseText.to_ascii_unicode(text)

    def replace_html(self, text):
        """Strip or replace HTML markup."""
        return self.ProcessContacts.replace_html(text, replace_with=config.REPLACE_WITH_HTML)

    def replace_urls(self, text):
        """Replace URLs with the configured token."""
        return self.ProcessContacts.replace_urls(text, replace_with=config.REPLACE_WITH_URL)

    def replace_emails(self, text):
        """Replace email addresses with the configured token."""
        return self.ProcessContacts.replace_emails(text, replace_with=config.REPLACE_WITH_EMAIL)

    def replace_years(self, text):
        """Replace four-digit years with the configured token."""
        return self.ProcessDateTime.replace_years(text, replace_with=config.REPLACE_WITH_YEARS)

    def replace_phone_numbers(self, text):
        """Replace phone numbers with the configured token."""
        return self.ProcessContacts.replace_phone_numbers(text, replace_with=config.REPLACE_WITH_PHONE_NUMBERS)

    def replace_numbers(self, text):
        """Replace remaining numbers with the configured token."""
        return self.ProcessContacts.replace_numbers(text, replace_with=config.REPLACE_WITH_NUMBERS)

    def replace_currency_symbols(self, text):
        """Replace currency symbols (token, or 3-letter codes when config value is None)."""
        return self.ProcessSpecialSymbols.replace_currency_symbols(text, replace_with=config.REPLACE_WITH_CURRENCY_SYMBOLS)

    def ner_process(self, text):
        """Remove words tagged with the configured positional tags (PER, LOC, ...)."""
        ner_words = self.GeneralNER.ner_process(text, config.POSITIONAL_TAGS, config.NER_CONFIDENCE_THRESHOLD, self.language)
        return self.ProcessStopwords.remove_words_from_string(text, ner_words)

    def remove_isolated_letters(self, text):
        """Drop isolated single letters that carry no meaning."""
        return self.ProcessSpecialSymbols.remove_isolated_letters(text)

    def remove_isolated_special_symbols(self, text):
        """Drop isolated special symbols and bracketed placeholder content."""
        return self.ProcessSpecialSymbols.remove_isolated_special_symbols(text)

    def normalize_whitespace(self, text):
        """Collapse runs of whitespace (line breaks removed)."""
        return self.NormaliseText.normalize_whitespace(text, no_line_breaks=True)

    def statistical_model_processing(self, text):
        """Derive the statistical-model variant (e.g. for fastText, SVM, LR).

        Bug fix: ``stext`` is now seeded with *text*, so disabling
        ``config.CHECK_CASEFOLD`` no longer raises UnboundLocalError in the
        steps below.
        """
        stext = text
        if config.CHECK_CASEFOLD:
            stext = stext.casefold()  # lowercase (full Unicode casefold)
        if config.CHECK_REMOVE_STOPWORDS:
            stext = self.ProcessStopwords.remove_stopwords(stext, self.language)
        if config.CHECK_REMOVE_PUNCTUATION:
            stext = self.ProcessSpecialSymbols.remove_punctuation(stext)
        if config.CHECK_REMOVE_ISOLATED_LETTERS:
            stext = self.ProcessSpecialSymbols.remove_isolated_letters(stext)
        if config.CHECK_NORMALIZE_WHITESPACE:
            stext = self.NormaliseText.normalize_whitespace(stext)
        return stext
File without changes
@@ -0,0 +1,131 @@
1
+ """
2
+ Constant symbols and compiled RegExs use for cleaning.
3
+ """
4
+
5
+ import re
6
+
7
+ CURRENCIES = {
8
+ "$": "USD",
9
+ "zł": "PLN",
10
+ "£": "GBP",
11
+ "¥": "JPY",
12
+ "฿": "THB",
13
+ "₡": "CRC",
14
+ "₦": "NGN",
15
+ "₩": "KRW",
16
+ "₪": "ILS",
17
+ "₫": "VND",
18
+ "€": "EUR",
19
+ "₱": "PHP",
20
+ "₲": "PYG",
21
+ "₴": "UAH",
22
+ "₹": "INR",
23
+ }
24
+ CURRENCY_REGEX = re.compile(
25
+ "({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
26
+ )
27
+
28
+ ACRONYM_REGEX = re.compile(
29
+ r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
30
+ flags=re.UNICODE,
31
+ )
32
+
33
+ # taken hostname, domainname, tld from URL regex below
34
+ EMAIL_REGEX = re.compile(
35
+ r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-](@|[(<{\[]at[)>}\]])(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))",
36
+ flags=re.IGNORECASE | re.UNICODE,
37
+ )
38
+
39
+ # for more information: https://github.com/jfilter/clean-text/issues/10
40
+ # PHONE_REGEX = re.compile(
41
+ # r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
42
+ # )
43
+ # PHONE_REGEX = re.compile(
44
+ # r"((?:^|(?<=[^\w)]))((\+?[01]|0{1,2}\d{0,1}|\+\d{2})[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
45
+ # )
46
+
47
+ PHONE_REGEX = re.compile(
48
+ r"((?:^|(?<=[^\w)]))((\+?\d+|0{1,2}\d*?)[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
49
+ )
50
+
51
+ NUMBERS_REGEX = re.compile(
52
+ r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
53
+ )
54
+
55
+ LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
56
+ TWO_LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+((\r\n)|[\n\v])+")
57
+ MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")
58
+ NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
59
+
60
+
61
+ HTML_REGEX = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', flags=re.UNICODE | re.IGNORECASE,)
62
+
63
+ # source: https://gist.github.com/dperini/729294
64
+ URL_REGEX = re.compile(
65
+ r"(?:^|(?<![\w\/\.]))"
66
+ # protocol identifier
67
+ # r"(?:(?:https?|ftp)://)" # <-- alt?
68
+ r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
69
+ # user:pass authentication
70
+ r"(?:\S+(?::\S*)?@)?" r"(?:"
71
+ # IP address exclusion
72
+ # private & local networks
73
+ r"(?!(?:10|127)(?:\.\d{1,3}){3})"
74
+ r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
75
+ r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
76
+ # IP address dotted notation octets
77
+ # excludes loopback network 0.0.0.0
78
+ # excludes reserved space >= 224.0.0.0
79
+ # excludes network & broadcast addresses
80
+ # (first & last IP address of each class)
81
+ r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
82
+ r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
83
+ r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
84
+ r"|"
85
+ # host name
86
+ r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
87
+ # domain name
88
+ r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
89
+ # TLD identifier
90
+ r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
91
+ # port number
92
+ # r"(?::\d{2,5})?"
93
+ r"(?::\d{2,5}\b)?"
94
+ # resource path
95
+ r"(?:\/[^\)\]\}\s]*)?",
96
+ # r"(?:$|(?![\w?!+&\/\)]))",
97
+ flags=re.UNICODE | re.IGNORECASE,
98
+ )
99
+
100
+
101
+ strange_double_quotes = [
102
+ "«",
103
+ "‹",
104
+ "»",
105
+ "›",
106
+ "„",
107
+ "“",
108
+ "‟",
109
+ "”",
110
+ "❝",
111
+ "❞",
112
+ "❮",
113
+ "❯",
114
+ "〝",
115
+ "〞",
116
+ "〟",
117
+ """,
118
+ ]
119
+ strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]
120
+
121
+ DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
122
+ SINGLE_QUOTE_REGEX = re.compile("|".join(strange_single_quotes))
123
+
124
+ YEAR_REGEX = re.compile(r"\b(19|20)\d{2}\b") # Matches years from 1900 to 2099
125
+
126
+ ISOLATED_LETTERS_REGEX = re.compile(r"(?:^|\s)[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, V, W, X, Y, Z](?=\s|$)", flags=re.UNICODE | re.IGNORECASE)
127
+
128
+ ISOLATED_SPECIAL_SYMBOLS_REGEX = re.compile(r"(?<![a-zA-Z0-9])[:_.|><;·}@~!?+#)({,/\\\\^]+(?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
129
+
130
+
131
+ SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
@@ -0,0 +1,50 @@
1
+ from sct.utils import constants
2
+ from bs4 import BeautifulSoup
3
+
4
class ProcessContacts:
    """Replace contact-like PII: URLs, HTML markup, emails, phone numbers, numbers."""

    def __init__(self):
        pass

    def replace_urls(self, text, replace_with="<URL>"):
        """
        Replace all URLs in ``text`` str with ``replace_with`` str.
        """
        return constants.URL_REGEX.sub(replace_with, text)

    def replace_html(self, text, replace_with="<HTML>"):
        """
        Replace all html tags in ``text`` str with ``replace_with`` str.

        Prefers BeautifulSoup's text extraction; if parsing fails for any
        reason, falls back to a regex-based substitution.
        """
        try:
            text = BeautifulSoup(text, 'html.parser').get_text()
        except Exception:  # was a bare ``except:``; keep the best-effort fallback
            text = constants.HTML_REGEX.sub(replace_with, text)

        return text

    def replace_emails(self, text, replace_with="<EMAIL>"):
        """
        Replace all emails in ``text`` str with ``replace_with`` str.
        """
        return constants.EMAIL_REGEX.sub(replace_with, text)

    def replace_phone_numbers(self, text, replace_with="<PHONE>"):
        """
        Replace all phone numbers in ``text`` str with ``replace_with`` str.
        """
        return constants.PHONE_REGEX.sub(replace_with, text)

    def replace_numbers(self, text, replace_with="<NUMBER>"):
        """
        Replace all numbers in ``text`` str with ``replace_with`` str.
        """
        return constants.NUMBERS_REGEX.sub(replace_with, text)
@@ -0,0 +1,15 @@
1
+ from sct.utils import constants
2
+
3
+
4
class ProcessDateTime:
    """Date/time related replacements."""

    def __init__(self):
        pass

    def replace_years(self, text, replace_with="<YEAR>"):
        """Replace four-digit years between 1900 and 2099 with ``replace_with``."""
        return constants.YEAR_REGEX.sub(replace_with, text)
@@ -0,0 +1,147 @@
1
+ import math
2
+ import itertools
3
+ from collections import defaultdict
4
+
5
+ import transformers
6
+ from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
7
+
8
+ from sct.utils import constants
9
+ transformers.logging.set_verbosity_error()
10
+
11
class GeneralNER:

    """
    Tag [PER, LOC, ORG, MISC] positional tags using an ensemble of NER models:
    a multilingual model and an English model always run; a language-specific
    model (Dutch/German/Spanish) is added when the detected language matches.
    """

    def __init__(self):

        # ---- NER models: one per explicitly supported language plus a
        # multilingual fallback that is always consulted.
        TOKENIZER_EN = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
        MODEL_EN = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")

        TOKENIZER_NL = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")
        MODEL_NL = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")

        TOKENIZER_DE = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-german")
        MODEL_DE = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-german")

        TOKENIZER_ES = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")
        MODEL_ES = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")

        TOKENIZER_MULTI = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
        MODEL_MULTI = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

        # Chunking length: smallest single-sentence capacity among the two
        # always-used tokenizers, reduced by a 10% safety buffer.
        self.len_single_sentence = [TOKENIZER_EN.max_len_single_sentence, TOKENIZER_MULTI.max_len_single_sentence]
        self.min_token_length = math.ceil(min(self.len_single_sentence) * 0.9)
        tokenizer_indicator = self.len_single_sentence.index(min(self.len_single_sentence))

        # Use the most restrictive tokenizer for measuring/splitting text.
        if tokenizer_indicator == 0:
            self.tokenizer = TOKENIZER_EN
        elif tokenizer_indicator == 1:
            self.tokenizer = TOKENIZER_MULTI

        self.nlp_en = pipeline("ner", model=MODEL_EN, tokenizer=TOKENIZER_EN, aggregation_strategy="simple")
        self.nlp_nl = pipeline("ner", model=MODEL_NL, tokenizer=TOKENIZER_NL, aggregation_strategy="simple")
        self.nlp_de = pipeline("ner", model=MODEL_DE, tokenizer=TOKENIZER_DE, aggregation_strategy="simple")
        self.nlp_es = pipeline("ner", model=MODEL_ES, tokenizer=TOKENIZER_ES, aggregation_strategy="simple")
        self.nlp_multi = pipeline("ner", model=MODEL_MULTI, tokenizer=TOKENIZER_MULTI, aggregation_strategy="simple")

    def ner_data(self, data, pos):
        """
        Normalise raw pipeline output: keep only entities whose group is in
        *pos*, and add a 'key' (start offset + end offset) identifying the span
        so results from different models can be matched up.
        """
        return [{'entity_group': ix['entity_group'], 'score': ix['score'], 'word': ix['word'], 'key': str(ix['start']) + str(ix['end'])} for ix in data if ix['entity_group'] in pos]

    def ner_ensemble(self, ner_results, t):
        """
        Ensemble vote: a span is accepted when the mean score across all models
        that reported it reaches threshold *t*. Returns the accepted words,
        longest first (so longer spans are removed before their substrings).
        """
        ner_keys = defaultdict(lambda: [0, 0])  # key -> [votes, score sum]
        ner_words = set()

        for entity in ner_results:
            ner_keys[entity['key']][0] += 1
            ner_keys[entity['key']][1] += entity['score']

        ner_keys = [key for key, val in ner_keys.items() if val[1] / val[0] >= t]

        for entity in ner_results:
            if entity['key'] in ner_keys:
                ner_words.add(entity['word'])

        ner_words = sorted(list(ner_words), key=len, reverse=True)
        return ner_words

    def ner_process(self, text, positional_tags, ner_confidence_threshold, language):
        """
        Executes the NER process to find words carrying positional tags (PER, LOC, ORG, MISC).

        Args:
            text (string): text from which positional tags need to be recognised
            positional_tags (list): pass tags as ['PER', 'LOC'] (default), also supports 'ORG', 'MISC'
            ner_confidence_threshold (float): minimum average ensemble confidence
            language (string): language model to add to the ensemble; currently
                supports DUTCH, GERMAN, SPANISH (English + multilingual always run)

        Returns:
            list: words for the provided positional tags that meet the
            threshold, sorted by length (longest first)
        """
        ner_results = []

        # Token count according to the most restrictive tokenizer.
        text_token_length = len(self.tokenizer.tokenize(text))

        # Number of chunks the text must be split into.
        num_parts = math.ceil(text_token_length / self.min_token_length)

        # Bug fix: the original tested ``num_parts == 0``, which is never true
        # for non-empty text (ceil of a positive ratio is >= 1), so even short
        # texts were needlessly routed through split_text. A single part needs
        # no splitting.
        if num_parts <= 1:
            texts = [text]
        else:
            texts = self.split_text(text, self.min_token_length, self.tokenizer)

        for chunk in texts:
            ner_results.append(self.ner_data(self.nlp_multi(chunk), positional_tags))
            ner_results.append(self.ner_data(self.nlp_en(chunk), positional_tags))

            if language == 'DUTCH':
                ner_results.append(self.ner_data(self.nlp_nl(chunk), positional_tags))
            elif language == 'GERMAN':
                ner_results.append(self.ner_data(self.nlp_de(chunk), positional_tags))
            elif language == 'SPANISH':
                ner_results.append(self.ner_data(self.nlp_es(chunk), positional_tags))

        # Flatten the per-chunk, per-model result lists.
        ner_results = list(itertools.chain.from_iterable(ner_results))
        ner_words = self.ner_ensemble(ner_results, ner_confidence_threshold)
        return ner_words


    def split_text(self, text, max_tokens, tokenizer):
        """
        Split *text* into chunks of at most *max_tokens* tokens, cutting only at
        sentence boundaries so no sentence is broken across chunks.
        """
        sentence_boundaries = [(m.start(), m.end()) for m in constants.SENTENCE_BOUNDARY_PATTERN.finditer(text)]

        chunks = []
        current_chunk = []
        current_token_count = 0
        current_position = 0

        for boundary_start, boundary_end in sentence_boundaries:
            sentence = text[current_position:boundary_start + 1]
            current_position = boundary_end

            token_count = len(tokenizer(sentence)["input_ids"])

            if current_token_count + token_count <= max_tokens:
                current_chunk.append(sentence)
                current_token_count += token_count
            else:
                chunks.append(''.join(current_chunk))
                current_chunk = [sentence]
                current_token_count = token_count

        # Append the trailing text after the last boundary.
        last_sentence = text[current_position:]
        current_chunk.append(last_sentence)
        chunks.append(''.join(current_chunk))

        return chunks
@@ -0,0 +1,83 @@
1
+ from ftfy import fix_text
2
+ from unidecode import unidecode
3
+ from emoji import demojize, emojize
4
+ #---
5
+ from sct.utils import constants
6
+
7
+
8
class NormaliseText:
    """Unicode repair, transliteration and whitespace normalisation helpers."""

    def __init__(self):
        pass

    def fix_bad_unicode(self, text, normalization="NFC"):
        """
        Fix unicode text that's "broken" using `ftfy <http://ftfy.readthedocs.org/>`_;
        this includes mojibake, HTML entities and other code cruft,
        and non-standard forms for display purposes.

        Args:
            text (str): raw text
            normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC',
                combines characters and diacritics written using separate code points,
                e.g. converting "e" plus an acute accent modifier into "é"; unicode
                can be converted to NFC form without any change in its meaning!
                if 'NFKC', additional normalizations are applied that can change
                the meanings of characters, e.g. ellipsis characters will be replaced
                with three periods
        """
        # Try to repair backslash-escaped strings first
        # (via https://stackoverflow.com/a/57192592/4028896); if the round-trip
        # fails we deliberately keep the original text.
        try:
            text = text.encode("latin", "backslashreplace").decode("unicode-escape")
        except Exception:  # was a bare ``except:``; keep best-effort semantics
            pass

        return fix_text(text, normalization=normalization)

    def fix_strange_quotes(self, text):
        """
        Replace strange quotes, i.e., 〞with a single quote ' or a double quote " if it fits better.
        """
        text = constants.SINGLE_QUOTE_REGEX.sub("'", text)
        text = constants.DOUBLE_QUOTE_REGEX.sub('"', text)
        return text

    def to_ascii_unicode(self, text, no_emoji=True):
        """
        Try to represent unicode data in ascii characters similar to what a human
        with a US keyboard would choose.
        Works great for languages of Western origin, worse the farther the language
        gets from Latin-based alphabets. It's based on hand-tuned character mappings
        that also contain ascii approximations for symbols and non-Latin alphabets.
        """
        # normalize quotes before since this improves transliteration quality
        text = self.fix_strange_quotes(text)

        if not no_emoji:
            # NOTE(review): ``use_aliases`` was removed in emoji>=2.0; this call
            # assumes an emoji 1.x install — confirm the pinned version.
            text = demojize(text, use_aliases=True)

        return unidecode(text)

    def normalize_whitespace(self, text, strip_lines=True, no_line_breaks=False, keep_two_line_breaks=False):
        """
        Given ``text`` str, replace one or more spacings with a single space, and one
        or more line breaks with a single newline. Also strip leading/trailing whitespace.
        """
        if strip_lines:
            text = "\n".join([x.strip() for x in text.splitlines()])

        if no_line_breaks:
            # Collapse ALL whitespace (including newlines) to single spaces.
            text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
        else:
            if keep_two_line_breaks:
                # Preserve paragraph breaks as exactly two newlines.
                text = constants.NONBREAKING_SPACE_REGEX.sub(
                    " ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
                )
            else:
                text = constants.NONBREAKING_SPACE_REGEX.sub(
                    " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
                )

        return text.strip()
@@ -0,0 +1,8 @@
1
import warnings
# NOTE(review): this silences ALL warnings process-wide, not just this
# module's — confirm that is intentional.
warnings.filterwarnings('ignore')

from lingua import Language, LanguageDetectorBuilder

#---- Detect the Language / Also add languages to support in Future
# Candidate languages for detection; restricting the set speeds up detection.
LANGUAGES = [Language.DUTCH, Language.ENGLISH]
# Shared, eagerly-built detector instance used by sct.sct.TextCleaner.detect_language.
DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build()
@@ -0,0 +1,50 @@
1
+ import re
2
+ import string
3
+ #---
4
+ from sct.utils import constants
5
+
6
class ProcessSpecialSymbols:
    """Currency, punctuation and isolated-symbol cleaning helpers."""

    # Compiled once at class creation; the originals rebuilt/recompiled these
    # patterns on every call.
    _BRACKETED_RE = re.compile(r'\[[^\]]+\]')  # [..] content, usually image/file placeholders
    _BRACED_RE = re.compile(r'\{[^}]+\}')      # {..} content, usually html/template links
    _LONE_QUOTE_RE = re.compile(r"(?<![a-zA-Z0-9])['\"\-*%](?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
    _PUNCT_RE = re.compile('[' + re.escape(string.punctuation) + ']')

    def __init__(self):
        pass

    def replace_currency_symbols(self, text, replace_with="<CUR>"):
        """
        Replace currency symbols in ``text`` str with string specified by ``replace_with`` str.

        Args:
            text (str): raw text
            replace_with (str): if None (default), replace symbols with
                their standard 3-letter abbreviations (e.g. '$' with 'USD', '£' with 'GBP');
                otherwise, pass in a string with which to replace all symbols
                (e.g. "*CURRENCY*")
        """
        if replace_with is None:
            for symbol, code in constants.CURRENCIES.items():
                text = text.replace(symbol, code)
            return text
        else:
            return constants.CURRENCY_REGEX.sub(replace_with, text)


    def remove_isolated_letters(self, text):
        """
        Removes any isolated letters which don't add any value to the text.
        """
        return constants.ISOLATED_LETTERS_REGEX.sub('', text)

    def remove_isolated_special_symbols(self, text):
        """
        Removes isolated symbols which shouldn't be present in the text, along
        with bracketed placeholder content.
        """
        cleaned_text = self._BRACKETED_RE.sub('', text)
        cleaned_text = self._BRACED_RE.sub('', cleaned_text)
        cleaned_text = constants.ISOLATED_SPECIAL_SYMBOLS_REGEX.sub('', cleaned_text)
        # Quotes/dashes/etc. not attached to an alphanumeric on either side.
        cleaned_text = self._LONE_QUOTE_RE.sub('', cleaned_text)

        return cleaned_text

    def remove_punctuation(self, text):
        """Strip every ASCII punctuation character from *text*."""
        return self._PUNCT_RE.sub('', text)
@@ -0,0 +1,35 @@
1
+ import re
2
+ from nltk.corpus import stopwords as sw
3
+
4
+ class ProcessStopwords:
5
+
6
+ def __init__(self):
7
+ ### stop words
8
+ self.STOP_WORDS_EN = sw.words('english')
9
+ self.STOP_WORDS_NL = sw.words('dutch')
10
+ self.STOP_WORDS_DE = sw.words('german')
11
+ self.STOP_WORDS_ES = sw.words('spanish')
12
+
13
+ def remove_stopwords(self, text, lan):
14
+ """
15
+ Removes stopwords based on the language.
16
+ """
17
+ if lan == 'DUTCH':
18
+ stop_words = self.STOP_WORDS_NL
19
+ elif lan == 'ENGLISH':
20
+ stop_words = self.STOP_WORDS_EN
21
+ elif lan == 'GERMAN':
22
+ stop_words = self.STOP_WORDS_DE
23
+ elif lan == 'SPANISH':
24
+ stop_words = self.STOP_WORDS_ES
25
+
26
+ text = text.split()
27
+ return " ".join([word for word in text if word not in stop_words])
28
+
29
+ def remove_words_from_string(self, text, words_to_remove):
30
+ # Join the words with the "|" (OR) operator in regex to create a pattern
31
+ pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'
32
+
33
+ # Use re.sub() to replace all matches with an empty string
34
+ result_text = re.sub(pattern, '', text)
35
+ return result_text
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,43 @@
1
from setuptools import setup, find_packages


def _read_long_description():
    """Return the README contents for the PyPI long_description field."""
    # Context manager + explicit encoding: the original bare open().read()
    # leaked the file handle and depended on the platform default encoding.
    with open('README.md', encoding='utf-8') as readme:
        return readme.read()


setup(
    name='SqueakyCleanText',
    version='0.1.0',
    author='Rehan Fazal',
    description='A comprehensive text cleaning and preprocessing pipeline.',
    long_description=_read_long_description(),
    long_description_content_type='text/markdown',
    url='https://github.com/rhnfzl/SqueakyCleanText',
    license='MIT',
    packages=find_packages(),
    install_requires=[
        'lingua-language-detector',
        'nltk',
        'emoji',
        'ftfy',
        'Unidecode',
        'beautifulsoup4',
        'transformers',
        'torch',
    ],
    extras_require={
        'dev': [
            'hypothesis',
            'faker',
            'flake8',
            'pytest',
        ],
    },
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.7',
    entry_points={
        'console_scripts': [
            'nltk_downloader=sct.scripts.download_nltk_stopwords:main'
        ],
    },
    test_suite='tests',
)
File without changes
@@ -0,0 +1,84 @@
1
+ import unittest
2
+ import random
3
+ import string
4
+ from hypothesis import given, settings
5
+ from hypothesis.strategies import text, from_regex
6
+ from faker import Faker
7
+ from sct import config
8
+ from sct.utils import contact, datetime, special, normtext, stopwords, constants
9
+
10
class TextCleanerTest(unittest.TestCase):
    """Property-based and faker-driven tests for the sct cleaning helpers.

    Regex tests generate inputs with hypothesis `from_regex(..., fullmatch=True)`
    so every sample is guaranteed to match the pattern under test; faker tests
    generate realistic emails/phone numbers/URLs and check they are replaced.
    """

    @classmethod
    def setUpClass(cls):
        # Switch off the NER step for these tests — presumably so only the
        # regex pipeline is exercised (TODO confirm what CHECK_NER_PROCESS gates).
        config.CHECK_NER_PROCESS = False
        # One shared instance of each processor; they are reused across tests.
        cls.ProcessContacts = contact.ProcessContacts()
        cls.ProcessDateTime = datetime.ProcessDateTime()
        cls.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
        cls.NormaliseText = normtext.NormaliseText()
        cls.ProcessStopwords = stopwords.ProcessStopwords()
        cls.fake = Faker()

    # deadline=None: regex generation / first-call setup can be slow, so
    # hypothesis' per-example time limit is disabled on every test below.
    @settings(deadline=None)
    @given(from_regex(constants.EMAIL_REGEX, fullmatch=True))
    def test_email_regex(self, rx):
        # A full-match email replaced with "" must leave nothing behind.
        self.assertEqual("", self.ProcessContacts.replace_emails(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.PHONE_REGEX, fullmatch=True))
    def test_phone_regex(self, rx):
        self.assertEqual("", self.ProcessContacts.replace_phone_numbers(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.NUMBERS_REGEX, fullmatch=True))
    def test_number_regex(self, rx):
        self.assertEqual("", self.ProcessContacts.replace_numbers(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.URL_REGEX, fullmatch=True))
    def test_url_regex(self, rx):
        # Weaker check than the others: only asserts the URL was altered,
        # not fully removed — URL_REGEX matches may leave residue.
        self.assertNotEqual(rx, self.ProcessContacts.replace_urls(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.YEAR_REGEX, fullmatch=True))
    def test_year_regex(self, rx):
        self.assertEqual("", self.ProcessDateTime.replace_years(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.ISOLATED_LETTERS_REGEX, fullmatch=True))
    def test_isolated_letters_regex(self, rx):
        # Removal can leave whitespace, so normalise before asserting empty.
        rx = self.ProcessSpecialSymbols.remove_isolated_letters(rx)
        rx = self.NormaliseText.normalize_whitespace(rx)
        self.assertEqual("", rx)

    @settings(deadline=None)
    @given(from_regex(constants.ISOLATED_SPECIAL_SYMBOLS_REGEX, fullmatch=True))
    def test_isolated_symbols_regex(self, rx):
        self.assertEqual("", self.ProcessSpecialSymbols.remove_isolated_special_symbols(rx))

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_email(self, fkw):
        """Check if generated emails are replaced correctly."""
        # fkw is a short random replacement token; the whole email must
        # collapse to exactly that token.
        email = self.fake.email()
        clean_email = self.ProcessContacts.replace_emails(email, replace_with=fkw)
        self.assertEqual(clean_email, fkw)

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_phone_number(self, fkw):
        """Check if generated phone numbers are replaced correctly."""
        phonenum = self.fake.phone_number()
        clean_phonenum = self.ProcessContacts.replace_phone_numbers(phonenum, replace_with=fkw)
        self.assertEqual(clean_phonenum, fkw)

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_url(self, fkw):
        """Check if generated URLs are replaced correctly."""
        url = self.fake.url()
        clean_url = self.ProcessContacts.replace_urls(url, replace_with=fkw)
        self.assertEqual(clean_url, fkw)

if __name__ == "__main__":
    unittest.main(verbosity=2)