SqueakyCleanText 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- squeakycleantext-0.1.0/LICENSE +21 -0
- squeakycleantext-0.1.0/MANIFEST.in +2 -0
- squeakycleantext-0.1.0/PKG-INFO +33 -0
- squeakycleantext-0.1.0/README.md +6 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/PKG-INFO +33 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/SOURCES.txt +26 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/dependency_links.txt +1 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/entry_points.txt +2 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/requires.txt +14 -0
- squeakycleantext-0.1.0/SqueakyCleanText.egg-info/top_level.txt +2 -0
- squeakycleantext-0.1.0/sct/__init__.py +0 -0
- squeakycleantext-0.1.0/sct/config.py +51 -0
- squeakycleantext-0.1.0/sct/scripts/__init__.py +0 -0
- squeakycleantext-0.1.0/sct/scripts/download_nltk_stopwords.py +7 -0
- squeakycleantext-0.1.0/sct/sct.py +126 -0
- squeakycleantext-0.1.0/sct/utils/__init__.py +0 -0
- squeakycleantext-0.1.0/sct/utils/constants.py +131 -0
- squeakycleantext-0.1.0/sct/utils/contact.py +50 -0
- squeakycleantext-0.1.0/sct/utils/datetime.py +15 -0
- squeakycleantext-0.1.0/sct/utils/ner.py +147 -0
- squeakycleantext-0.1.0/sct/utils/normtext.py +83 -0
- squeakycleantext-0.1.0/sct/utils/resources.py +8 -0
- squeakycleantext-0.1.0/sct/utils/special.py +50 -0
- squeakycleantext-0.1.0/sct/utils/stopwords.py +35 -0
- squeakycleantext-0.1.0/setup.cfg +4 -0
- squeakycleantext-0.1.0/setup.py +43 -0
- squeakycleantext-0.1.0/tests/__init__.py +0 -0
- squeakycleantext-0.1.0/tests/test_sct.py +84 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) [2024] [Rehan Fazal]
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: SqueakyCleanText
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
|
+
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
|
+
Author: Rehan Fazal
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.7
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: lingua-language-detector
|
|
15
|
+
Requires-Dist: nltk
|
|
16
|
+
Requires-Dist: emoji
|
|
17
|
+
Requires-Dist: ftfy
|
|
18
|
+
Requires-Dist: Unidecode
|
|
19
|
+
Requires-Dist: beautifulsoup4
|
|
20
|
+
Requires-Dist: transformers
|
|
21
|
+
Requires-Dist: torch
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: hypothesis; extra == "dev"
|
|
24
|
+
Requires-Dist: faker; extra == "dev"
|
|
25
|
+
Requires-Dist: flake8; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
|
|
28
|
+
# SqueakyCleanText
|
|
29
|
+
Clean your Text for Classical ML and Language Model
|
|
30
|
+
|
|
31
|
+
# TODO
|
|
32
|
+
- Ability to change the NER MODELS from the config file, which supports AutoModel and AutoTokenizer
|
|
33
|
+
- Add more language support in stopwords
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
Metadata-Version: 2.1
|
|
2
|
+
Name: SqueakyCleanText
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
|
+
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
|
+
Author: Rehan Fazal
|
|
7
|
+
License: MIT
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.7
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
Requires-Dist: lingua-language-detector
|
|
15
|
+
Requires-Dist: nltk
|
|
16
|
+
Requires-Dist: emoji
|
|
17
|
+
Requires-Dist: ftfy
|
|
18
|
+
Requires-Dist: Unidecode
|
|
19
|
+
Requires-Dist: beautifulsoup4
|
|
20
|
+
Requires-Dist: transformers
|
|
21
|
+
Requires-Dist: torch
|
|
22
|
+
Provides-Extra: dev
|
|
23
|
+
Requires-Dist: hypothesis; extra == "dev"
|
|
24
|
+
Requires-Dist: faker; extra == "dev"
|
|
25
|
+
Requires-Dist: flake8; extra == "dev"
|
|
26
|
+
Requires-Dist: pytest; extra == "dev"
|
|
27
|
+
|
|
28
|
+
# SqueakyCleanText
|
|
29
|
+
Clean your Text for Classical ML and Language Model
|
|
30
|
+
|
|
31
|
+
# TODO
|
|
32
|
+
- Ability to change the NER MODELS from the config file, which supports AutoModel and AutoTokenizer
|
|
33
|
+
- Add more language support in stopwords
|
|
@@ -0,0 +1,26 @@
|
|
|
1
|
+
LICENSE
|
|
2
|
+
MANIFEST.in
|
|
3
|
+
README.md
|
|
4
|
+
setup.py
|
|
5
|
+
SqueakyCleanText.egg-info/PKG-INFO
|
|
6
|
+
SqueakyCleanText.egg-info/SOURCES.txt
|
|
7
|
+
SqueakyCleanText.egg-info/dependency_links.txt
|
|
8
|
+
SqueakyCleanText.egg-info/entry_points.txt
|
|
9
|
+
SqueakyCleanText.egg-info/requires.txt
|
|
10
|
+
SqueakyCleanText.egg-info/top_level.txt
|
|
11
|
+
sct/__init__.py
|
|
12
|
+
sct/config.py
|
|
13
|
+
sct/sct.py
|
|
14
|
+
sct/scripts/__init__.py
|
|
15
|
+
sct/scripts/download_nltk_stopwords.py
|
|
16
|
+
sct/utils/__init__.py
|
|
17
|
+
sct/utils/constants.py
|
|
18
|
+
sct/utils/contact.py
|
|
19
|
+
sct/utils/datetime.py
|
|
20
|
+
sct/utils/ner.py
|
|
21
|
+
sct/utils/normtext.py
|
|
22
|
+
sct/utils/resources.py
|
|
23
|
+
sct/utils/special.py
|
|
24
|
+
sct/utils/stopwords.py
|
|
25
|
+
tests/__init__.py
|
|
26
|
+
tests/test_sct.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
File without changes
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
"""
|
|
2
|
+
detect_language : to detect the language automatically, but would consume more time if done on a batch
|
|
3
|
+
fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
|
|
4
|
+
to_ascii_unicode : if True, convert non-to_ascii characters into their closest to_ascii equivalents
|
|
5
|
+
replace_with_url : special URL token, default "",
|
|
6
|
+
replace_with_email : special EMAIL token, default "",
|
|
7
|
+
replace_years : replace year, default "",
|
|
8
|
+
replace_with_phone_number : special PHONE token, default "",
|
|
9
|
+
replace_with_number : special NUMBER token, default "",
|
|
10
|
+
no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
|
|
11
|
+
ner_process : To execute NER Process to remove the positpositional tags, PER, LOC, ORG, MISC
|
|
12
|
+
remove_isolated_letters : remove any isolated letters which doesn't add any value to the text
|
|
13
|
+
remove_isolated_symbols : remove any isolated symbols which shouldn't be present in the text, usually which isn't
|
|
14
|
+
immediatly prefixed and suffixed by letter or number
|
|
15
|
+
normalize_whitespace : remove any unnecessary whitespace
|
|
16
|
+
statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
|
|
17
|
+
casefold : to lower the text
|
|
18
|
+
remove_stopwords : remove stopwords based on the language, usues NLTK stopwords
|
|
19
|
+
remove_punctuation : removes all the special symbols
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
CHECK_DETECT_LANGUAGE = True
|
|
23
|
+
CHECK_FIX_BAD_UNICODE = True
|
|
24
|
+
CHECK_TO_ASCII_UNICODE = True
|
|
25
|
+
CHECK_REPLACE_HTML = True
|
|
26
|
+
CHECK_REPLACE_URLS = True
|
|
27
|
+
CHECK_REPLACE_EMAILS = True
|
|
28
|
+
CHECK_REPLACE_YEARS = True
|
|
29
|
+
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
30
|
+
CHECK_REPLACE_NUMBERS = True
|
|
31
|
+
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
32
|
+
CHECK_NER_PROCESS = True
|
|
33
|
+
CHECK_REMOVE_CUSTOM_STOP_WORDS = True
|
|
34
|
+
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
35
|
+
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
36
|
+
CHECK_NORMALIZE_WHITESPACE = True
|
|
37
|
+
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
38
|
+
CHECK_CASEFOLD = True
|
|
39
|
+
CHECK_REMOVE_STOPWORDS = True
|
|
40
|
+
CHECK_REMOVE_PUNCTUATION = True
|
|
41
|
+
CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
|
|
42
|
+
REPLACE_WITH_URL = ""
|
|
43
|
+
REPLACE_WITH_HTML = ""
|
|
44
|
+
REPLACE_WITH_EMAIL = ""
|
|
45
|
+
REPLACE_WITH_YEARS = ""
|
|
46
|
+
REPLACE_WITH_PHONE_NUMBERS = ""
|
|
47
|
+
REPLACE_WITH_NUMBERS = ""
|
|
48
|
+
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
49
|
+
POSITIONAL_TAGS = ['PER', 'LOC']
|
|
50
|
+
NER_CONFIDENCE_THRESHOLD = 0.75
|
|
51
|
+
LANGUAGE = None
|
|
File without changes
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This code provides a comprehensive text cleaning and preprocessing pipeline.
|
|
3
|
+
It includes functions to normalize, remove personal information and clean text data,
|
|
4
|
+
which is crucial for natural language processing tasks.
|
|
5
|
+
"""
|
|
6
|
+
from sct import config
|
|
7
|
+
from sct.utils import contact, datetime, ner, normtext, resources, special, stopwords
|
|
8
|
+
|
|
9
|
+
class TextCleaner:
    """Configurable text cleaning pipeline driven by the flags in ``sct.config``.

    Builds an ordered list of cleaning steps at construction time and applies
    them in sequence via :meth:`process`.
    """

    def __init__(self):
        # Helper processors used by the individual pipeline steps.
        self.ProcessContacts = contact.ProcessContacts()
        self.ProcessDateTime = datetime.ProcessDateTime()
        self.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
        self.NormaliseText = normtext.NormaliseText()
        self.ProcessStopwords = stopwords.ProcessStopwords()
        self.GeneralNER = ner.GeneralNER()
        self.pipeline = []
        # Set by detect_language() during process(); None until then.
        self.language = None
        self.init_pipeline()

    def init_pipeline(self):
        """Append each enabled cleaning step to ``self.pipeline`` in order."""
        if config.CHECK_DETECT_LANGUAGE:
            self.pipeline.append(self.detect_language)

        if config.CHECK_FIX_BAD_UNICODE:
            self.pipeline.append(self.fix_bad_unicode)
        if config.CHECK_TO_ASCII_UNICODE:
            self.pipeline.append(self.to_ascii_unicode)

        if config.CHECK_REPLACE_HTML:
            self.pipeline.append(self.replace_html)
        if config.CHECK_REPLACE_URLS:
            self.pipeline.append(self.replace_urls)
        if config.CHECK_REPLACE_EMAILS:
            self.pipeline.append(self.replace_emails)
        if config.CHECK_REPLACE_YEARS:
            self.pipeline.append(self.replace_years)
        if config.CHECK_REPLACE_PHONE_NUMBERS:
            self.pipeline.append(self.replace_phone_numbers)
        if config.CHECK_REPLACE_NUMBERS:
            self.pipeline.append(self.replace_numbers)
        if config.CHECK_REPLACE_CURRENCY_SYMBOLS:
            self.pipeline.append(self.replace_currency_symbols)

        if config.CHECK_NER_PROCESS:
            self.pipeline.append(self.ner_process)

        if config.CHECK_REMOVE_ISOLATED_LETTERS:
            self.pipeline.append(self.remove_isolated_letters)
        if config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS:
            self.pipeline.append(self.remove_isolated_special_symbols)
        if config.CHECK_NORMALIZE_WHITESPACE:
            self.pipeline.append(self.normalize_whitespace)

    def process(self, text):
        """Run every configured cleaning step over ``text``.

        Returns ``(text, stext, language)`` when statistical-model processing
        is enabled, otherwise ``(text, language)``.
        """
        text = str(text)

        for step in self.pipeline:
            text = step(text)

        if config.CHECK_STATISTICAL_MODEL_PROCESSING:
            stext = self.statistical_model_processing(text)
            return text, stext, self.language

        return text, self.language

    def detect_language(self, text):
        """Detect and record the language of ``text``; the text is unchanged."""
        # DETECTOR returns an enum-like value such as "Language.ENGLISH";
        # keep only the name after the dot.
        self.language = str(resources.DETECTOR.detect_language_of(text)).split(".")[-1]
        return text

    def fix_bad_unicode(self, text):
        """Fix mojibake and other broken unicode via ftfy."""
        return self.NormaliseText.fix_bad_unicode(text)

    def to_ascii_unicode(self, text):
        """Transliterate unicode text to its closest ascii representation."""
        return self.NormaliseText.to_ascii_unicode(text)

    def replace_html(self, text):
        """Strip/replace HTML tags and entities."""
        return self.ProcessContacts.replace_html(text, replace_with=config.REPLACE_WITH_HTML)

    def replace_urls(self, text):
        """Replace URLs with the configured token."""
        return self.ProcessContacts.replace_urls(text, replace_with=config.REPLACE_WITH_URL)

    def replace_emails(self, text):
        """Replace email addresses with the configured token."""
        return self.ProcessContacts.replace_emails(text, replace_with=config.REPLACE_WITH_EMAIL)

    def replace_years(self, text):
        """Replace four-digit years with the configured token."""
        return self.ProcessDateTime.replace_years(text, replace_with=config.REPLACE_WITH_YEARS)

    def replace_phone_numbers(self, text):
        """Replace phone numbers with the configured token."""
        return self.ProcessContacts.replace_phone_numbers(text, replace_with=config.REPLACE_WITH_PHONE_NUMBERS)

    def replace_numbers(self, text):
        """Replace standalone numbers with the configured token."""
        return self.ProcessContacts.replace_numbers(text, replace_with=config.REPLACE_WITH_NUMBERS)

    def replace_currency_symbols(self, text):
        """Replace currency symbols (e.g. with their alphabetical codes)."""
        return self.ProcessSpecialSymbols.replace_currency_symbols(text, replace_with=config.REPLACE_WITH_CURRENCY_SYMBOLS)

    def ner_process(self, text):
        """Remove words tagged with the configured NER entity groups."""
        ner_words = self.GeneralNER.ner_process(text, config.POSITIONAL_TAGS, config.NER_CONFIDENCE_THRESHOLD, self.language)
        return self.ProcessStopwords.remove_words_from_string(text, ner_words)

    def remove_isolated_letters(self, text):
        """Drop single letters that carry no meaning on their own."""
        return self.ProcessSpecialSymbols.remove_isolated_letters(text)

    def remove_isolated_special_symbols(self, text):
        """Drop special symbols not attached to a letter or digit."""
        return self.ProcessSpecialSymbols.remove_isolated_special_symbols(text)

    def normalize_whitespace(self, text):
        """Collapse runs of whitespace into single spaces."""
        return self.NormaliseText.normalize_whitespace(text, no_line_breaks=True)

    def statistical_model_processing(self, text):
        """Produce an extra-cleaned variant of ``text`` for classical ML models.

        Applies casefolding, stopword removal, punctuation removal, isolated
        letter removal and whitespace normalization, each behind its own flag.
        """
        # Start from the already-cleaned text so every flag combination is
        # safe; previously `stext` was unbound (NameError) when
        # CHECK_CASEFOLD was False but a later flag was True.
        stext = text
        if config.CHECK_CASEFOLD:
            stext = stext.casefold()  # lowercase
        if config.CHECK_REMOVE_STOPWORDS:
            stext = self.ProcessStopwords.remove_stopwords(stext, self.language)
        if config.CHECK_REMOVE_PUNCTUATION:
            stext = self.ProcessSpecialSymbols.remove_punctuation(stext)
        if config.CHECK_REMOVE_ISOLATED_LETTERS:
            stext = self.ProcessSpecialSymbols.remove_isolated_letters(stext)
        if config.CHECK_NORMALIZE_WHITESPACE:
            stext = self.NormaliseText.normalize_whitespace(stext)
        return stext
|
File without changes
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Constant symbols and compiled RegExs use for cleaning.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
CURRENCIES = {
|
|
8
|
+
"$": "USD",
|
|
9
|
+
"zł": "PLN",
|
|
10
|
+
"£": "GBP",
|
|
11
|
+
"¥": "JPY",
|
|
12
|
+
"฿": "THB",
|
|
13
|
+
"₡": "CRC",
|
|
14
|
+
"₦": "NGN",
|
|
15
|
+
"₩": "KRW",
|
|
16
|
+
"₪": "ILS",
|
|
17
|
+
"₫": "VND",
|
|
18
|
+
"€": "EUR",
|
|
19
|
+
"₱": "PHP",
|
|
20
|
+
"₲": "PYG",
|
|
21
|
+
"₴": "UAH",
|
|
22
|
+
"₹": "INR",
|
|
23
|
+
}
|
|
24
|
+
CURRENCY_REGEX = re.compile(
|
|
25
|
+
"({})+".format("|".join(re.escape(c) for c in CURRENCIES.keys()))
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
ACRONYM_REGEX = re.compile(
|
|
29
|
+
r"(?:^|(?<=\W))(?:(?:(?:(?:[A-Z]\.?)+[a-z0-9&/-]?)+(?:[A-Z][s.]?|[0-9]s?))|(?:[0-9](?:\-?[A-Z])+))(?:$|(?=\W))",
|
|
30
|
+
flags=re.UNICODE,
|
|
31
|
+
)
|
|
32
|
+
|
|
33
|
+
# taken hostname, domainname, tld from URL regex below
|
|
34
|
+
EMAIL_REGEX = re.compile(
|
|
35
|
+
r"(?:^|(?<=[^\w@.)]))([\w+-](\.(?!\.))?)*?[\w+-](@|[(<{\[]at[)>}\]])(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))",
|
|
36
|
+
flags=re.IGNORECASE | re.UNICODE,
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
# for more information: https://github.com/jfilter/clean-text/issues/10
|
|
40
|
+
# PHONE_REGEX = re.compile(
|
|
41
|
+
# r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
42
|
+
# )
|
|
43
|
+
# PHONE_REGEX = re.compile(
|
|
44
|
+
# r"((?:^|(?<=[^\w)]))((\+?[01]|0{1,2}\d{0,1}|\+\d{2})[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
45
|
+
# )
|
|
46
|
+
|
|
47
|
+
PHONE_REGEX = re.compile(
|
|
48
|
+
r"((?:^|(?<=[^\w)]))((\+?\d+|0{1,2}\d*?)[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
NUMBERS_REGEX = re.compile(
|
|
52
|
+
r"(?:^|(?<=[^\w,.]))[+–-]?(([1-9]\d{0,2}(,\d{3})+(\.\d*)?)|([1-9]\d{0,2}([ .]\d{3})+(,\d*)?)|(\d*?[.,]\d+)|\d+)(?:$|(?=\b))"
|
|
53
|
+
)
|
|
54
|
+
|
|
55
|
+
LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+")
|
|
56
|
+
TWO_LINEBREAK_REGEX = re.compile(r"((\r\n)|[\n\v])+((\r\n)|[\n\v])+")
|
|
57
|
+
MULTI_WHITESPACE_TO_ONE_REGEX = re.compile(r"\s+")
|
|
58
|
+
NONBREAKING_SPACE_REGEX = re.compile(r"(?!\n)\s+")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
HTML_REGEX = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', flags=re.UNICODE | re.IGNORECASE,)
|
|
62
|
+
|
|
63
|
+
# source: https://gist.github.com/dperini/729294
|
|
64
|
+
URL_REGEX = re.compile(
|
|
65
|
+
r"(?:^|(?<![\w\/\.]))"
|
|
66
|
+
# protocol identifier
|
|
67
|
+
# r"(?:(?:https?|ftp)://)" # <-- alt?
|
|
68
|
+
r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
|
|
69
|
+
# user:pass authentication
|
|
70
|
+
r"(?:\S+(?::\S*)?@)?" r"(?:"
|
|
71
|
+
# IP address exclusion
|
|
72
|
+
# private & local networks
|
|
73
|
+
r"(?!(?:10|127)(?:\.\d{1,3}){3})"
|
|
74
|
+
r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
|
|
75
|
+
r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
|
|
76
|
+
# IP address dotted notation octets
|
|
77
|
+
# excludes loopback network 0.0.0.0
|
|
78
|
+
# excludes reserved space >= 224.0.0.0
|
|
79
|
+
# excludes network & broadcast addresses
|
|
80
|
+
# (first & last IP address of each class)
|
|
81
|
+
r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
|
|
82
|
+
r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
|
|
83
|
+
r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
|
|
84
|
+
r"|"
|
|
85
|
+
# host name
|
|
86
|
+
r"(?:(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)"
|
|
87
|
+
# domain name
|
|
88
|
+
r"(?:\.(?:[a-z\\u00a1-\\uffff0-9]-?)*[a-z\\u00a1-\\uffff0-9]+)*"
|
|
89
|
+
# TLD identifier
|
|
90
|
+
r"(?:\.(?:[a-z\\u00a1-\\uffff]{2,}))" r"|" r"(?:(localhost))" r")"
|
|
91
|
+
# port number
|
|
92
|
+
# r"(?::\d{2,5})?"
|
|
93
|
+
r"(?::\d{2,5}\b)?"
|
|
94
|
+
# resource path
|
|
95
|
+
r"(?:\/[^\)\]\}\s]*)?",
|
|
96
|
+
# r"(?:$|(?![\w?!+&\/\)]))",
|
|
97
|
+
flags=re.UNICODE | re.IGNORECASE,
|
|
98
|
+
)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
strange_double_quotes = [
|
|
102
|
+
"«",
|
|
103
|
+
"‹",
|
|
104
|
+
"»",
|
|
105
|
+
"›",
|
|
106
|
+
"„",
|
|
107
|
+
"“",
|
|
108
|
+
"‟",
|
|
109
|
+
"”",
|
|
110
|
+
"❝",
|
|
111
|
+
"❞",
|
|
112
|
+
"❮",
|
|
113
|
+
"❯",
|
|
114
|
+
"〝",
|
|
115
|
+
"〞",
|
|
116
|
+
"〟",
|
|
117
|
+
""",
|
|
118
|
+
]
|
|
119
|
+
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]
|
|
120
|
+
|
|
121
|
+
DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
|
|
122
|
+
SINGLE_QUOTE_REGEX = re.compile("|".join(strange_single_quotes))
|
|
123
|
+
|
|
124
|
+
YEAR_REGEX = re.compile(r"\b(19|20)\d{2}\b") # Matches years from 1900 to 2099
|
|
125
|
+
|
|
126
|
+
ISOLATED_LETTERS_REGEX = re.compile(r"(?:^|\s)[B, C, D, E, F, G, H, I, J, K, L, M, N, O, P, Q, R, S, T, V, W, X, Y, Z](?=\s|$)", flags=re.UNICODE | re.IGNORECASE)
|
|
127
|
+
|
|
128
|
+
ISOLATED_SPECIAL_SYMBOLS_REGEX = re.compile(r"(?<![a-zA-Z0-9])[:_.|><;·}@~!?+#)({,/\\\\^]+(?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from sct.utils import constants
|
|
2
|
+
from bs4 import BeautifulSoup
|
|
3
|
+
|
|
4
|
+
class ProcessContacts:
    """Replace contact-like substrings (URLs, emails, phone numbers, numbers,
    HTML) with placeholder tokens using the compiled patterns in
    ``sct.utils.constants``.
    """

    def __init__(self):
        pass

    def replace_urls(self, text, replace_with="<URL>"):
        """
        Replace all URLs in ``text`` str with ``replace_with`` str.
        """
        return constants.URL_REGEX.sub(replace_with, text)

    def replace_html(self, text, replace_with="<HTML>"):
        """
        Replace all html tags in ``text`` str with ``replace_with`` str.

        Prefers BeautifulSoup's text extraction; falls back to a regex
        substitution if parsing fails.
        """
        try:
            soup = BeautifulSoup(text, 'html.parser')
            text = soup.get_text()
        except Exception:
            # Best-effort fallback: strip tags/entities via regex instead of
            # silently swallowing every error type (incl. KeyboardInterrupt).
            text = constants.HTML_REGEX.sub(replace_with, text)

        return text

    def replace_emails(self, text, replace_with="<EMAIL>"):
        """
        Replace all emails in ``text`` str with ``replace_with`` str.
        """
        return constants.EMAIL_REGEX.sub(replace_with, text)

    def replace_phone_numbers(self, text, replace_with="<PHONE>"):
        """
        Replace all phone numbers in ``text`` str with ``replace_with`` str.
        """
        return constants.PHONE_REGEX.sub(replace_with, text)

    def replace_numbers(self, text, replace_with="<NUMBER>"):
        """
        Replace all numbers in ``text`` str with ``replace_with`` str.
        """
        return constants.NUMBERS_REGEX.sub(replace_with, text)
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from sct.utils import constants
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ProcessDateTime:
    """Date/time-related text substitutions."""

    def __init__(self):
        pass

    def replace_years(self, text, replace_with="<YEAR>"):
        """
        Replaces years between 1900 to 2099 in the text with a special token.
        """
        return constants.YEAR_REGEX.sub(replace_with, text)
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import itertools
|
|
3
|
+
from collections import defaultdict
|
|
4
|
+
|
|
5
|
+
import transformers
|
|
6
|
+
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
|
7
|
+
|
|
8
|
+
from sct.utils import constants
|
|
9
|
+
transformers.logging.set_verbosity_error()
|
|
10
|
+
|
|
11
|
+
class GeneralNER:

    """
    To tag [PER, LOC, ORG, MISC] positional tags using an ensemble technique.

    Loads one fine-tuned XLM-RoBERTa model per supported language plus a
    multilingual model; entities detected by more than one model are combined
    and filtered by average confidence.
    """

    def __init__(self):

        #---- NER Models (downloaded from the HuggingFace hub on first use)
        TOKENIZER_EN = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")
        MODEL_EN = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-english")

        TOKENIZER_NL = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")
        MODEL_NL = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-dutch")

        TOKENIZER_DE = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-german")
        MODEL_DE = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll03-german")

        TOKENIZER_ES = AutoTokenizer.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")
        MODEL_ES = AutoModelForTokenClassification.from_pretrained("FacebookAI/xlm-roberta-large-finetuned-conll02-spanish")

        TOKENIZER_MULTI = AutoTokenizer.from_pretrained("Babelscape/wikineural-multilingual-ner")
        MODEL_MULTI = AutoModelForTokenClassification.from_pretrained("Babelscape/wikineural-multilingual-ner")

        # Length with buffer of 10% — chunking limit is 90% of the smallest
        # tokenizer's max single-sentence length.
        self.len_single_sentence = [TOKENIZER_EN.max_len_single_sentence, TOKENIZER_MULTI.max_len_single_sentence]
        self.min_token_length = math.ceil(min(self.len_single_sentence) * 0.9)
        tokenizer_indicator = self.len_single_sentence.index(min(self.len_single_sentence))

        # Use the more restrictive tokenizer to measure text length for chunking.
        if tokenizer_indicator == 0:
            self.tokenizer = TOKENIZER_EN
        elif tokenizer_indicator == 1:
            self.tokenizer = TOKENIZER_MULTI

        # "simple" aggregation merges sub-word pieces into whole-word entities.
        self.nlp_en = pipeline("ner", model=MODEL_EN, tokenizer=TOKENIZER_EN, aggregation_strategy="simple")
        self.nlp_nl = pipeline("ner", model=MODEL_NL, tokenizer=TOKENIZER_NL, aggregation_strategy="simple")
        self.nlp_de = pipeline("ner", model=MODEL_DE, tokenizer=TOKENIZER_DE, aggregation_strategy="simple")
        self.nlp_es = pipeline("ner", model=MODEL_ES, tokenizer=TOKENIZER_ES, aggregation_strategy="simple")
        self.nlp_multi = pipeline("ner", model=MODEL_MULTI, tokenizer=TOKENIZER_MULTI, aggregation_strategy="simple")

    def ner_data(self, data, pos):
        """
        Formats NER (Named Entity Recognition) results.

        Keeps only entities whose group is in ``pos``; 'key' (start+end offsets
        concatenated) identifies the same span across different models.
        """
        return [{'entity_group': ix['entity_group'], 'score': ix['score'], 'word': ix['word'], 'key': str(ix['start']) + str(ix['end'])} for ix in data if ix['entity_group'] in pos]

    def ner_ensemble(self, ner_results, t):
        """
        Applies an ensemble method for NER.

        An entity span is kept when the average score across all models that
        detected it is at least ``t``; returns the matched words sorted by
        length (longest first) so longer phrases are removed before substrings.
        """
        # key -> [vote count, summed score]
        ner_keys = defaultdict(lambda: [0, 0])
        ner_words = set()

        for entity in ner_results:
            ner_keys[entity['key']][0] += 1
            ner_keys[entity['key']][1] += entity['score']

        ner_keys = [key for key, val in ner_keys.items() if val[1] / val[0] >= t]

        for entity in ner_results:
            if entity['key'] in ner_keys:
                ner_words.add(entity['word'])

        ner_words = sorted(list(ner_words), key=len, reverse=True)
        return ner_words

    def ner_process(self, text, positional_tags, ner_confidence_threshold, language):
        """_summary_
        Executes NER Process to remove the positional tags, PER, LOC, ORG, MISC.
        Args:
            text (string): text from which positional tags need to be recognised
            positional_tags (list): pass tags as ['PER', 'LOC'] (default), also supports 'ORG', 'MISC'
            ner_confidence_threshold (float): minimum average ensemble confidence
            language (string): language model which need to be used, currently supports ENGLISH, DUTCH, GERMAN, SPANISH

        Returns:
            list: a list of words sorted based on length for the provided positional tags which meets the threshold
        """
        ner_results = []

        # text length
        text_token_length = len(self.tokenizer.tokenize(text))

        # parts it need to get split
        num_parts = math.ceil(text_token_length/self.min_token_length)

        # NOTE(review): num_parts is 0 only for empty text, so split_text runs
        # for any non-empty input — confirm whether `num_parts <= 1` was intended.
        if num_parts == 0:
            texts = [text]
        else:
            texts = self.split_text(text, self.min_token_length, self.tokenizer)

        for text in texts:
            # Always run the multilingual and English models ...
            ner_results.append(self.ner_data(self.nlp_multi(text), positional_tags))
            ner_results.append(self.ner_data(self.nlp_en(text), positional_tags))

            # ... plus the language-specific model when one exists.
            if language == 'DUTCH':
                ner_results.append(self.ner_data(self.nlp_nl(text), positional_tags))
            elif language == 'GERMAN':
                ner_results.append(self.ner_data(self.nlp_de(text), positional_tags))
            elif language == 'SPANISH':
                ner_results.append(self.ner_data(self.nlp_es(text), positional_tags))

        # flat out the list
        ner_results = list(itertools.chain.from_iterable(ner_results))
        ner_words = self.ner_ensemble(ner_results, ner_confidence_threshold)
        return ner_words


    def split_text(self, text, max_tokens, tokenizer):
        """Split ``text`` into chunks of at most ``max_tokens`` tokens,
        breaking only at sentence boundaries."""
        sentence_boundaries = [(m.start(), m.end()) for m in constants.SENTENCE_BOUNDARY_PATTERN.finditer(text)]

        chunks = []
        current_chunk = []
        current_token_count = 0
        current_position = 0

        for boundary_start, boundary_end in sentence_boundaries:
            # Sentence runs up to (and including) the boundary punctuation.
            sentence = text[current_position:boundary_start+1]
            current_position = boundary_end

            token_count = len(tokenizer(sentence)["input_ids"])

            if current_token_count + token_count <= max_tokens:
                current_chunk.append(sentence)
                current_token_count += token_count
            else:
                chunks.append(''.join(current_chunk))
                current_chunk = [sentence]
                current_token_count = token_count

        # Append the last sentence
        last_sentence = text[current_position:]
        current_chunk.append(last_sentence)
        chunks.append(''.join(current_chunk))

        return chunks
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
from ftfy import fix_text
|
|
2
|
+
from unidecode import unidecode
|
|
3
|
+
from emoji import demojize, emojize
|
|
4
|
+
#---
|
|
5
|
+
from sct.utils import constants
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NormaliseText:
    """Unicode, quote, and whitespace normalisation helpers for raw text."""

    def __init__(self):
        # Stateless helper class; nothing to initialise.
        pass

    def fix_bad_unicode(self, text, normalization="NFC"):
        """
        Fix unicode text that's "broken" using `ftfy <http://ftfy.readthedocs.org/>`_;
        this includes mojibake, HTML entities and other code cruft,
        and non-standard forms for display purposes.

        Args:
            text (str): raw text
            normalization ({'NFC', 'NFKC', 'NFD', 'NFKD'}): if 'NFC',
                combines characters and diacritics written using separate code points,
                e.g. converting "e" plus an acute accent modifier into "é"; unicode
                can be converted to NFC form without any change in its meaning!
                if 'NFKC', additional normalizations are applied that can change
                the meanings of characters, e.g. ellipsis characters will be replaced
                with three periods

        Returns:
            str: the repaired text
        """
        # Trying to fix backslash-replaced strings
        # (via https://stackoverflow.com/a/57192592/4028896).
        # Only unicode-related failures are swallowed here; the previous bare
        # ``except:`` also hid unrelated bugs (e.g. AttributeError on non-str input).
        try:
            text = text.encode("latin", "backslashreplace").decode("unicode-escape")
        except UnicodeError:
            pass

        return fix_text(text, normalization=normalization)

    def fix_strange_quotes(self, text):
        """
        Replace strange quotes, i.e., 〞with a single quote ' or a double quote " if it fits better.
        """
        text = constants.SINGLE_QUOTE_REGEX.sub("'", text)
        text = constants.DOUBLE_QUOTE_REGEX.sub('"', text)
        return text

    def to_ascii_unicode(self, text, no_emoji=True):
        """
        Try to represent unicode data in ascii characters similar to what a human
        with a US keyboard would choose.

        Works great for languages of Western origin, worse the farther the language
        gets from Latin-based alphabets. It's based on hand-tuned character mappings
        that also contain ascii approximations for symbols and non-Latin alphabets.

        Args:
            text (str): raw text
            no_emoji (bool): when False, emojis are first converted to their
                ``:alias:`` text form so they survive transliteration; when True
                they are simply dropped by ``unidecode``.
        """
        # Normalize quotes before since this improves transliteration quality.
        text = self.fix_strange_quotes(text)

        if not no_emoji:
            # emoji>=2.0 removed the ``use_aliases`` keyword in favour of
            # ``language='alias'``; keep a fallback for older emoji releases,
            # since setup.py does not pin the dependency version.
            try:
                text = demojize(text, language="alias")
            except TypeError:
                text = demojize(text, use_aliases=True)

        text = unidecode(text)

        return text

    def normalize_whitespace(self, text, strip_lines=True, no_line_breaks=False, keep_two_line_breaks=False):
        """
        Given ``text`` str, replace one or more spacings with a single space, and one
        or more line breaks with a single newline. Also strip leading/trailing whitespace.

        Args:
            text (str): raw text
            strip_lines (bool): strip leading/trailing whitespace from every line first
            no_line_breaks (bool): collapse ALL whitespace (including newlines) to single spaces
            keep_two_line_breaks (bool): preserve paragraph breaks (double newlines)
                instead of collapsing them to one
        """
        if strip_lines:
            text = "\n".join([x.strip() for x in text.splitlines()])

        if no_line_breaks:
            text = constants.MULTI_WHITESPACE_TO_ONE_REGEX.sub(" ", text)
        else:
            if keep_two_line_breaks:
                text = constants.NONBREAKING_SPACE_REGEX.sub(
                    " ", constants.TWO_LINEBREAK_REGEX.sub(r"\n\n", text)
                )
            else:
                text = constants.NONBREAKING_SPACE_REGEX.sub(
                    " ", constants.LINEBREAK_REGEX.sub(r"\n", text)
                )

        return text.strip()
|
|
@@ -0,0 +1,8 @@
|
|
|
1
|
+
"""Shared language-detection resources, built once at import time."""
import warnings
# NOTE(review): this silences ALL warnings process-wide for any program that
# imports this module — confirm this side effect is intentional.
warnings.filterwarnings('ignore')

from lingua import Language, LanguageDetectorBuilder

#---- Detect the Language / Also add languages to support in Future
# Candidate languages the detector chooses between; extend this list to add
# support for more languages.
LANGUAGES = [Language.DUTCH, Language.ENGLISH]
# Module-level singleton detector shared by all importers (building it is
# comparatively expensive, so it is done exactly once here).
DETECTOR = LanguageDetectorBuilder.from_languages(*LANGUAGES).build()
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
import re
|
|
2
|
+
import string
|
|
3
|
+
#---
|
|
4
|
+
from sct.utils import constants
|
|
5
|
+
|
|
6
|
+
class ProcessSpecialSymbols:
    """Helpers for stripping or replacing special symbols in text."""

    def __init__(self):
        # Stateless; nothing to set up.
        pass

    def replace_currency_symbols(self, text, replace_with="<CUR>"):
        """
        Replace currency symbols in ``text`` str with string specified by ``replace_with`` str.

        Args:
            text (str): raw text
            replace_with (str): if None, each symbol becomes its standard
                3-letter abbreviation (e.g. '$' -> 'USD', '£' -> 'GBP');
                otherwise every currency symbol is replaced by this string
                (e.g. "*CURRENCY*")
        """
        if replace_with is not None:
            return constants.CURRENCY_REGEX.sub(replace_with, text)
        for symbol, abbreviation in constants.CURRENCIES.items():
            text = text.replace(symbol, abbreviation)
        return text

    def remove_isolated_letters(self, text):
        """Remove isolated single letters that add no value to the text."""
        return constants.ISOLATED_LETTERS_REGEX.sub('', text)

    def remove_isolated_special_symbols(self, text):
        """Remove isolated symbols that should not be present in the text."""
        # Bracketed content is usually image/file placeholder text.
        without_brackets = re.sub(r'\[[^\]]+\]', '', text)
        # Curly-brace content is usually an HTML-link payload.
        without_braces = re.sub(r'\{[^}]+\}', '', without_brackets)
        stripped = constants.ISOLATED_SPECIAL_SYMBOLS_REGEX.sub('', without_braces)
        # Drop quotes/dashes/asterisks/percent signs not attached to a word.
        return re.sub(
            r"(?<![a-zA-Z0-9])['\"\-*%](?![a-zA-Z0-9])",
            '',
            stripped,
            flags=re.UNICODE | re.IGNORECASE,
        )

    def remove_punctuation(self, text):
        """Strip every ASCII punctuation character from ``text``."""
        # Equivalent to re.sub('[<punctuation>]', '', text), done in one
        # C-level pass via str.translate.
        return text.translate(str.maketrans('', '', string.punctuation))
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
import re
|
|
2
|
+
from nltk.corpus import stopwords as sw
|
|
3
|
+
|
|
4
|
+
class ProcessStopwords:
    """Stopword-removal utilities backed by NLTK's stopword corpora."""

    def __init__(self):
        ### stop words — loaded once so repeated calls are cheap
        self.STOP_WORDS_EN = sw.words('english')
        self.STOP_WORDS_NL = sw.words('dutch')
        self.STOP_WORDS_DE = sw.words('german')
        self.STOP_WORDS_ES = sw.words('spanish')

    def remove_stopwords(self, text, lan):
        """
        Removes stopwords based on the language.

        Args:
            text (str): whitespace-separated text
            lan (str): one of 'DUTCH', 'ENGLISH', 'GERMAN', 'SPANISH'
                (case-sensitive); any other value removes nothing.

        Returns:
            str: the text with stopwords removed, joined by single spaces
        """
        if lan == 'DUTCH':
            stop_words = self.STOP_WORDS_NL
        elif lan == 'ENGLISH':
            stop_words = self.STOP_WORDS_EN
        elif lan == 'GERMAN':
            stop_words = self.STOP_WORDS_DE
        elif lan == 'SPANISH':
            stop_words = self.STOP_WORDS_ES
        else:
            # Previously an unsupported language left ``stop_words`` unbound
            # and raised NameError; fall back to removing nothing instead.
            stop_words = ()

        words = text.split()
        return " ".join([word for word in words if word not in stop_words])

    def remove_words_from_string(self, text, words_to_remove):
        """
        Remove every exact (word-boundary-delimited) occurrence of the given
        words from ``text``.

        Args:
            text (str): raw text
            words_to_remove (iterable[str]): literal words to strip

        Returns:
            str: the text with matches replaced by empty strings (surrounding
            whitespace is left untouched)
        """
        # Join the words with the "|" (OR) operator in regex to create a pattern.
        pattern = r'\b(?:' + '|'.join(re.escape(word) for word in words_to_remove) + r')\b'

        # Use re.sub() to replace all matches with an empty string.
        result_text = re.sub(pattern, '', text)
        return result_text
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
"""Packaging configuration for SqueakyCleanText."""
from pathlib import Path

from setuptools import setup, find_packages

# Read the long description with an explicit encoding and without leaking an
# open file handle: the previous ``open('README.md').read()`` never closed the
# file and relied on the platform default encoding, which can fail at install
# time on systems whose locale encoding cannot decode the README.
LONG_DESCRIPTION = Path('README.md').read_text(encoding='utf-8')

setup(
    name='SqueakyCleanText',
    version='0.1.0',
    author='Rehan Fazal',
    description='A comprehensive text cleaning and preprocessing pipeline.',
    long_description=LONG_DESCRIPTION,
    long_description_content_type='text/markdown',
    url='https://github.com/rhnfzl/SqueakyCleanText',
    license='MIT',
    packages=find_packages(),
    install_requires=[
        'lingua-language-detector',
        'nltk',
        'emoji',
        'ftfy',
        'Unidecode',
        'beautifulsoup4',
        'transformers',
        'torch',
    ],
    extras_require={
        'dev': [
            'hypothesis',
            'faker',
            'flake8',
            'pytest',
        ],
    },
    classifiers=[
        'Programming Language :: Python :: 3',
        'License :: OSI Approved :: MIT License',
        'Operating System :: OS Independent',
    ],
    python_requires='>=3.7',
    entry_points={
        'console_scripts': [
            # Installs a CLI hook that downloads the NLTK stopword corpora.
            'nltk_downloader=sct.scripts.download_nltk_stopwords:main'
        ],
    },
    test_suite='tests',
)
|
|
File without changes
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import random
|
|
3
|
+
import string
|
|
4
|
+
from hypothesis import given, settings
|
|
5
|
+
from hypothesis.strategies import text, from_regex
|
|
6
|
+
from faker import Faker
|
|
7
|
+
from sct import config
|
|
8
|
+
from sct.utils import contact, datetime, special, normtext, stopwords, constants
|
|
9
|
+
|
|
10
|
+
class TextCleanerTest(unittest.TestCase):
    """Property-based tests for the sct cleaning utilities.

    Uses ``hypothesis`` to generate inputs that match the package's own
    regexes, and ``faker`` for realistic emails/phone numbers/URLs.
    """

    @classmethod
    def setUpClass(cls):
        """Build each processor once for the whole test class (they are stateless)."""
        # NER models are heavy to load; skip that stage for these unit tests.
        config.CHECK_NER_PROCESS = False
        cls.ProcessContacts = contact.ProcessContacts()
        cls.ProcessDateTime = datetime.ProcessDateTime()
        cls.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
        cls.NormaliseText = normtext.NormaliseText()
        cls.ProcessStopwords = stopwords.ProcessStopwords()
        cls.fake = Faker()

    # deadline=None on every test: regex-strategy generation times vary too
    # much for hypothesis's default per-example deadline.
    @settings(deadline=None)
    @given(from_regex(constants.EMAIL_REGEX, fullmatch=True))
    def test_email_regex(self, rx):
        """Any full match of EMAIL_REGEX is replaced completely."""
        self.assertEqual("", self.ProcessContacts.replace_emails(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.PHONE_REGEX, fullmatch=True))
    def test_phone_regex(self, rx):
        """Any full match of PHONE_REGEX is replaced completely."""
        self.assertEqual("", self.ProcessContacts.replace_phone_numbers(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.NUMBERS_REGEX, fullmatch=True))
    def test_number_regex(self, rx):
        """Any full match of NUMBERS_REGEX is replaced completely."""
        self.assertEqual("", self.ProcessContacts.replace_numbers(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.URL_REGEX, fullmatch=True))
    def test_url_regex(self, rx):
        """Replacing URLs changes any full match of URL_REGEX."""
        # NOTE(review): this only asserts the text changed, not that it became
        # empty — presumably the replacement may leave surrounding characters.
        self.assertNotEqual(rx, self.ProcessContacts.replace_urls(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.YEAR_REGEX, fullmatch=True))
    def test_year_regex(self, rx):
        """Any full match of YEAR_REGEX is replaced completely."""
        self.assertEqual("", self.ProcessDateTime.replace_years(rx, ""))

    @settings(deadline=None)
    @given(from_regex(constants.ISOLATED_LETTERS_REGEX, fullmatch=True))
    def test_isolated_letters_regex(self, rx):
        """Isolated letters vanish after removal plus whitespace normalisation."""
        rx = self.ProcessSpecialSymbols.remove_isolated_letters(rx)
        rx = self.NormaliseText.normalize_whitespace(rx)
        self.assertEqual("", rx)

    @settings(deadline=None)
    @given(from_regex(constants.ISOLATED_SPECIAL_SYMBOLS_REGEX, fullmatch=True))
    def test_isolated_symbols_regex(self, rx):
        """Isolated special symbols are removed completely."""
        self.assertEqual("", self.ProcessSpecialSymbols.remove_isolated_special_symbols(rx))

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_email(self, fkw):
        """Check if generated emails are replaced correctly."""
        email = self.fake.email()
        clean_email = self.ProcessContacts.replace_emails(email, replace_with=fkw)
        self.assertEqual(clean_email, fkw)

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_phone_number(self, fkw):
        """Check if generated phone numbers are replaced correctly."""
        phonenum = self.fake.phone_number()
        clean_phonenum = self.ProcessContacts.replace_phone_numbers(phonenum, replace_with=fkw)
        self.assertEqual(clean_phonenum, fkw)

    @settings(deadline=None)
    @given(text(alphabet=string.ascii_letters + string.digits, min_size=0, max_size=4))
    def test_faker_url(self, fkw):
        """Check if generated URLs are replaced correctly."""
        url = self.fake.url()
        clean_url = self.ProcessContacts.replace_urls(url, replace_with=fkw)
        self.assertEqual(clean_url, fkw)
|
|
82
|
+
|
|
83
|
+
if __name__ == "__main__":
    # Allow running this test module directly: `python test_sct.py`.
    unittest.main(verbosity=2)
|