SqueakyCleanText 0.2.5__tar.gz → 0.2.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.6}/PKG-INFO +49 -48
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/README.md +49 -48
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6/SqueakyCleanText.egg-info}/PKG-INFO +49 -48
- SqueakyCleanText-0.2.6/sct/config.py +57 -0
- SqueakyCleanText-0.2.6/sct/sct.py +129 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/constants.py +9 -2
- SqueakyCleanText-0.2.6/sct/utils/contact.py +50 -0
- SqueakyCleanText-0.2.6/sct/utils/datetime.py +15 -0
- SqueakyCleanText-0.2.6/sct/utils/ner.py +228 -0
- SqueakyCleanText-0.2.6/sct/utils/normtext.py +83 -0
- SqueakyCleanText-0.2.6/sct/utils/special.py +50 -0
- SqueakyCleanText-0.2.6/sct/utils/stopwords.py +35 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/setup.py +1 -1
- SqueakyCleanText-0.2.5/sct/config.py +0 -100
- SqueakyCleanText-0.2.5/sct/sct.py +0 -341
- SqueakyCleanText-0.2.5/sct/utils/contact.py +0 -88
- SqueakyCleanText-0.2.5/sct/utils/datetime.py +0 -26
- SqueakyCleanText-0.2.5/sct/utils/ner.py +0 -367
- SqueakyCleanText-0.2.5/sct/utils/normtext.py +0 -148
- SqueakyCleanText-0.2.5/sct/utils/special.py +0 -113
- SqueakyCleanText-0.2.5/sct/utils/stopwords.py +0 -62
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/LICENSE +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/MANIFEST.in +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/requires.txt +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/top_level.txt +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/__init__.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/scripts/__init__.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/scripts/download_nltk_stopwords.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/__init__.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/resources.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/setup.cfg +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/tests/__init__.py +0 -0
- {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/tests/test_sct.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: SqueakyCleanText
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
5
|
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
6
|
Author: Rehan Fazal
|
|
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
156
156
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
157
157
|
|
|
158
158
|
```python
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
159
|
+
from sct import sct, config
|
|
160
|
+
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
161
|
+
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
162
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
163
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
164
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
165
|
+
config.CHECK_REPLACE_HTML = True
|
|
166
|
+
config.CHECK_REPLACE_URLS = True
|
|
167
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
168
|
+
config.CHECK_REPLACE_YEARS = True
|
|
169
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
170
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
171
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
172
|
+
config.CHECK_NER_PROCESS = True
|
|
173
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
174
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
175
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
176
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
177
|
+
config.CHECK_CASEFOLD = True
|
|
178
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
179
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
180
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
181
|
+
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
182
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
183
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
184
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
185
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
186
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
187
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
188
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
189
|
+
# You can remove any of the tags
|
|
190
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
191
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
192
|
+
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
193
|
+
config.LANGUAGE = None
|
|
194
|
+
|
|
195
|
+
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
196
|
+
# All models passed need to support transformers AutoModel
|
|
197
|
+
config.NER_MODELS_LIST = [
|
|
198
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
199
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
200
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
201
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
202
|
+
"Babelscape/wikineural-multilingual-ner"
|
|
203
|
+
]
|
|
204
|
+
|
|
205
|
+
sx = sct.TextCleaner()
|
|
206
206
|
```
|
|
207
207
|
|
|
208
|
+
|
|
208
209
|
## API
|
|
209
210
|
|
|
210
211
|
### `sct.TextCleaner`
|
|
@@ -132,55 +132,56 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
132
132
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
133
133
|
|
|
134
134
|
```python
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
135
|
+
from sct import sct, config
|
|
136
|
+
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
137
|
+
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
138
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
139
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
140
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
141
|
+
config.CHECK_REPLACE_HTML = True
|
|
142
|
+
config.CHECK_REPLACE_URLS = True
|
|
143
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
144
|
+
config.CHECK_REPLACE_YEARS = True
|
|
145
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
146
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
147
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
148
|
+
config.CHECK_NER_PROCESS = True
|
|
149
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
150
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
151
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
152
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
153
|
+
config.CHECK_CASEFOLD = True
|
|
154
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
155
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
156
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
157
|
+
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
158
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
159
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
160
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
161
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
162
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
163
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
164
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
165
|
+
# You can remove any of the tags
|
|
166
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
167
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
168
|
+
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
169
|
+
config.LANGUAGE = None
|
|
170
|
+
|
|
171
|
+
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
172
|
+
# All models passed need to support transformers AutoModel
|
|
173
|
+
config.NER_MODELS_LIST = [
|
|
174
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
175
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
176
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
177
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
178
|
+
"Babelscape/wikineural-multilingual-ner"
|
|
179
|
+
]
|
|
180
|
+
|
|
181
|
+
sx = sct.TextCleaner()
|
|
182
182
|
```
|
|
183
183
|
|
|
184
|
+
|
|
184
185
|
## API
|
|
185
186
|
|
|
186
187
|
### `sct.TextCleaner`
|
|
@@ -204,4 +205,4 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
|
|
|
204
205
|
|
|
205
206
|
The package took inspirations from the following repo:
|
|
206
207
|
|
|
207
|
-
- [clean-text](https://github.com/jfilter/clean-text)
|
|
208
|
+
- [clean-text](https://github.com/jfilter/clean-text)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: SqueakyCleanText
|
|
3
|
-
Version: 0.2.5
|
|
3
|
+
Version: 0.2.6
|
|
4
4
|
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
5
|
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
6
|
Author: Rehan Fazal
|
|
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
156
156
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
157
157
|
|
|
158
158
|
```python
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
159
|
+
from sct import sct, config
|
|
160
|
+
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
161
|
+
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
162
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
163
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
164
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
165
|
+
config.CHECK_REPLACE_HTML = True
|
|
166
|
+
config.CHECK_REPLACE_URLS = True
|
|
167
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
168
|
+
config.CHECK_REPLACE_YEARS = True
|
|
169
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
170
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
171
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
172
|
+
config.CHECK_NER_PROCESS = True
|
|
173
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
174
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
175
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
176
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
177
|
+
config.CHECK_CASEFOLD = True
|
|
178
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
179
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
180
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
181
|
+
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
182
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
183
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
184
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
185
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
186
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
187
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
188
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
189
|
+
# You can remove any of the tags
|
|
190
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
191
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
192
|
+
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
193
|
+
config.LANGUAGE = None
|
|
194
|
+
|
|
195
|
+
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
196
|
+
# All models passed need to support transformers AutoModel
|
|
197
|
+
config.NER_MODELS_LIST = [
|
|
198
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
199
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
200
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
201
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
202
|
+
"Babelscape/wikineural-multilingual-ner"
|
|
203
|
+
]
|
|
204
|
+
|
|
205
|
+
sx = sct.TextCleaner()
|
|
206
206
|
```
|
|
207
207
|
|
|
208
|
+
|
|
208
209
|
## API
|
|
209
210
|
|
|
210
211
|
### `sct.TextCleaner`
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""
|
|
2
|
+
detect_language : to detect the language automatically, but would consume more time if done on a batch
|
|
3
|
+
fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
|
|
4
|
+
to_ascii_unicode : if True, convert non-ASCII characters into their closest ASCII equivalents
|
|
5
|
+
replace_with_url : special URL token, default "<URL>",
|
|
6
|
+
replace_with_email : special EMAIL token, default "<EMAIL>",
|
|
7
|
+
replace_years : special YEAR token used to replace years, default "<YEAR>",
|
|
8
|
+
replace_with_phone_number : special PHONE token, default "<PHONE>",
|
|
9
|
+
replace_with_number : special NUMBER token, default "<NUMBER>",
|
|
10
|
+
no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
|
|
11
|
+
ner_process : To execute NER Process to remove the positional tags, PER, LOC, ORG, MISC
|
|
12
|
+
remove_isolated_letters : remove any isolated letters that don't add any value to the text
|
|
13
|
+
remove_isolated_symbols : remove any isolated symbols which shouldn't be present in the text, usually which isn't
|
|
14
|
+
immediately prefixed and suffixed by letter or number
|
|
15
|
+
normalize_whitespace : remove any unnecessary whitespace
|
|
16
|
+
statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
|
|
17
|
+
casefold : to lower the text
|
|
18
|
+
remove_stopwords : remove stopwords based on the language, uses NLTK stopwords
|
|
19
|
+
remove_punctuation : removes all the special symbols
|
|
20
|
+
"""
|
|
21
|
+
|
|
22
|
+
CHECK_DETECT_LANGUAGE = True
|
|
23
|
+
CHECK_FIX_BAD_UNICODE = True
|
|
24
|
+
CHECK_TO_ASCII_UNICODE = True
|
|
25
|
+
CHECK_REPLACE_HTML = True
|
|
26
|
+
CHECK_REPLACE_URLS = True
|
|
27
|
+
CHECK_REPLACE_EMAILS = True
|
|
28
|
+
CHECK_REPLACE_YEARS = True
|
|
29
|
+
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
30
|
+
CHECK_REPLACE_NUMBERS = True
|
|
31
|
+
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
32
|
+
CHECK_NER_PROCESS = True
|
|
33
|
+
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
34
|
+
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
35
|
+
CHECK_NORMALIZE_WHITESPACE = True
|
|
36
|
+
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
37
|
+
CHECK_CASEFOLD = True
|
|
38
|
+
CHECK_REMOVE_STOPWORDS = True
|
|
39
|
+
CHECK_REMOVE_PUNCTUATION = True
|
|
40
|
+
CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
|
|
41
|
+
REPLACE_WITH_URL = "<URL>"
|
|
42
|
+
REPLACE_WITH_HTML = "<HTML>"
|
|
43
|
+
REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
44
|
+
REPLACE_WITH_YEARS = "<YEAR>"
|
|
45
|
+
REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
46
|
+
REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
47
|
+
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
48
|
+
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
49
|
+
NER_CONFIDENCE_THRESHOLD = 0.85
|
|
50
|
+
LANGUAGE = None
|
|
51
|
+
|
|
52
|
+
# Order of the model is Important : English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
53
|
+
NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
54
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
55
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
56
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
57
|
+
"Babelscape/wikineural-multilingual-ner"]
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
"""
|
|
2
|
+
This code provides a comprehensive text cleaning and preprocessing pipeline.
|
|
3
|
+
It includes functions to normalize, remove personal information and clean text data,
|
|
4
|
+
which is crucial for natural language processing tasks.
|
|
5
|
+
"""
|
|
6
|
+
from sct import config
|
|
7
|
+
from sct.utils import contact, datetime, ner, normtext, resources, special, stopwords
|
|
8
|
+
|
|
9
|
+
class TextCleaner:
|
|
10
|
+
|
|
11
|
+
def __init__(self):
|
|
12
|
+
self.ProcessContacts = contact.ProcessContacts()
|
|
13
|
+
self.ProcessDateTime = datetime.ProcessDateTime()
|
|
14
|
+
self.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
|
|
15
|
+
self.NormaliseText = normtext.NormaliseText()
|
|
16
|
+
self.ProcessStopwords = stopwords.ProcessStopwords()
|
|
17
|
+
self.GeneralNER = ner.GeneralNER()
|
|
18
|
+
self.pipeline = []
|
|
19
|
+
self.language = None
|
|
20
|
+
self.init_pipeline()
|
|
21
|
+
|
|
22
|
+
def init_pipeline(self):
|
|
23
|
+
# Initialize pipeline steps based on config
|
|
24
|
+
language_config = config.LANGUAGE.lower() if config.LANGUAGE else None
|
|
25
|
+
|
|
26
|
+
if language_config and language_config in resources.LANGUAGE_NAME:
|
|
27
|
+
self.language = language_config.upper()
|
|
28
|
+
elif any([config.CHECK_DETECT_LANGUAGE, config.CHECK_NER_PROCESS, config.CHECK_REMOVE_STOPWORDS]):
|
|
29
|
+
self.pipeline.append(self.detect_language)
|
|
30
|
+
|
|
31
|
+
if config.CHECK_FIX_BAD_UNICODE:
|
|
32
|
+
self.pipeline.append(self.fix_bad_unicode)
|
|
33
|
+
if config.CHECK_TO_ASCII_UNICODE:
|
|
34
|
+
self.pipeline.append(self.to_ascii_unicode)
|
|
35
|
+
|
|
36
|
+
if config.CHECK_REPLACE_HTML:
|
|
37
|
+
self.pipeline.append(self.replace_html)
|
|
38
|
+
if config.CHECK_REPLACE_URLS:
|
|
39
|
+
self.pipeline.append(self.replace_urls)
|
|
40
|
+
if config.CHECK_REPLACE_EMAILS:
|
|
41
|
+
self.pipeline.append(self.replace_emails)
|
|
42
|
+
if config.CHECK_REPLACE_YEARS:
|
|
43
|
+
self.pipeline.append(self.replace_years)
|
|
44
|
+
if config.CHECK_REPLACE_PHONE_NUMBERS:
|
|
45
|
+
self.pipeline.append(self.replace_phone_numbers)
|
|
46
|
+
if config.CHECK_REPLACE_NUMBERS:
|
|
47
|
+
self.pipeline.append(self.replace_numbers)
|
|
48
|
+
if config.CHECK_REPLACE_CURRENCY_SYMBOLS:
|
|
49
|
+
self.pipeline.append(self.replace_currency_symbols)
|
|
50
|
+
|
|
51
|
+
if config.CHECK_NER_PROCESS:
|
|
52
|
+
self.pipeline.append(self.ner_process)
|
|
53
|
+
|
|
54
|
+
if config.CHECK_REMOVE_ISOLATED_LETTERS:
|
|
55
|
+
self.pipeline.append(self.remove_isolated_letters)
|
|
56
|
+
if config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS:
|
|
57
|
+
self.pipeline.append(self.remove_isolated_special_symbols)
|
|
58
|
+
if config.CHECK_NORMALIZE_WHITESPACE:
|
|
59
|
+
self.pipeline.append(self.normalize_whitespace)
|
|
60
|
+
|
|
61
|
+
def process(self, text):
|
|
62
|
+
text = str(text)
|
|
63
|
+
|
|
64
|
+
for step in self.pipeline:
|
|
65
|
+
text = step(text)
|
|
66
|
+
|
|
67
|
+
if config.CHECK_STATISTICAL_MODEL_PROCESSING:
|
|
68
|
+
stext = self.statistical_model_processing(text)
|
|
69
|
+
return text, stext, self.language
|
|
70
|
+
elif config.CHECK_DETECT_LANGUAGE:
|
|
71
|
+
return text, self.language
|
|
72
|
+
else:
|
|
73
|
+
return text
|
|
74
|
+
|
|
75
|
+
def detect_language(self, text):
|
|
76
|
+
self.language = str(resources.DETECTOR.detect_language_of(text)).split(".")[-1]
|
|
77
|
+
return text
|
|
78
|
+
|
|
79
|
+
def fix_bad_unicode(self, text):
|
|
80
|
+
return self.NormaliseText.fix_bad_unicode(text)
|
|
81
|
+
|
|
82
|
+
def to_ascii_unicode(self, text):
|
|
83
|
+
return self.NormaliseText.to_ascii_unicode(text)
|
|
84
|
+
|
|
85
|
+
def replace_html(self, text):
|
|
86
|
+
return self.ProcessContacts.replace_html(text, replace_with=config.REPLACE_WITH_HTML)
|
|
87
|
+
|
|
88
|
+
def replace_urls(self, text):
|
|
89
|
+
return self.ProcessContacts.replace_urls(text, replace_with=config.REPLACE_WITH_URL)
|
|
90
|
+
|
|
91
|
+
def replace_emails(self, text):
|
|
92
|
+
return self.ProcessContacts.replace_emails(text, replace_with=config.REPLACE_WITH_EMAIL)
|
|
93
|
+
|
|
94
|
+
def replace_years(self, text):
|
|
95
|
+
return self.ProcessDateTime.replace_years(text, replace_with=config.REPLACE_WITH_YEARS)
|
|
96
|
+
|
|
97
|
+
def replace_phone_numbers(self, text):
|
|
98
|
+
return self.ProcessContacts.replace_phone_numbers(text, replace_with=config.REPLACE_WITH_PHONE_NUMBERS)
|
|
99
|
+
|
|
100
|
+
def replace_numbers(self, text):
|
|
101
|
+
return self.ProcessContacts.replace_numbers(text, replace_with=config.REPLACE_WITH_NUMBERS)
|
|
102
|
+
|
|
103
|
+
def replace_currency_symbols(self, text):
|
|
104
|
+
return self.ProcessSpecialSymbols.replace_currency_symbols(text, replace_with=config.REPLACE_WITH_CURRENCY_SYMBOLS)
|
|
105
|
+
|
|
106
|
+
def ner_process(self, text):
|
|
107
|
+
return self.GeneralNER.ner_process(text, config.POSITIONAL_TAGS, config.NER_CONFIDENCE_THRESHOLD, self.language)
|
|
108
|
+
|
|
109
|
+
def remove_isolated_letters(self, text):
|
|
110
|
+
return self.ProcessSpecialSymbols.remove_isolated_letters(text)
|
|
111
|
+
|
|
112
|
+
def remove_isolated_special_symbols(self, text):
|
|
113
|
+
return self.ProcessSpecialSymbols.remove_isolated_special_symbols(text)
|
|
114
|
+
|
|
115
|
+
def normalize_whitespace(self, text):
|
|
116
|
+
return self.NormaliseText.normalize_whitespace(text, no_line_breaks=True)
|
|
117
|
+
|
|
118
|
+
def statistical_model_processing(self, text):
|
|
119
|
+
if config.CHECK_CASEFOLD:
|
|
120
|
+
stext = text.casefold() # lowercase
|
|
121
|
+
if config.CHECK_REMOVE_STOPWORDS:
|
|
122
|
+
stext = self.ProcessStopwords.remove_stopwords(stext, self.language)
|
|
123
|
+
if config.CHECK_REMOVE_PUNCTUATION:
|
|
124
|
+
stext = self.ProcessSpecialSymbols.remove_punctuation(stext)
|
|
125
|
+
if config.CHECK_REMOVE_ISOLATED_LETTERS:
|
|
126
|
+
stext = self.ProcessSpecialSymbols.remove_isolated_letters(stext)
|
|
127
|
+
if config.CHECK_NORMALIZE_WHITESPACE:
|
|
128
|
+
stext = self.NormaliseText.normalize_whitespace(stext)
|
|
129
|
+
return stext
|
|
@@ -36,6 +36,14 @@ EMAIL_REGEX = re.compile(
|
|
|
36
36
|
flags=re.IGNORECASE | re.UNICODE,
|
|
37
37
|
)
|
|
38
38
|
|
|
39
|
+
# for more information: https://github.com/jfilter/clean-text/issues/10
|
|
40
|
+
# PHONE_REGEX = re.compile(
|
|
41
|
+
# r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
42
|
+
# )
|
|
43
|
+
# PHONE_REGEX = re.compile(
|
|
44
|
+
# r"((?:^|(?<=[^\w)]))((\+?[01]|0{1,2}\d{0,1}|\+\d{2})[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
45
|
+
# )
|
|
46
|
+
|
|
39
47
|
PHONE_REGEX = re.compile(
|
|
40
48
|
r"((?:^|(?<=[^\w)]))((\+?\d+|0{1,2}\d*?)[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
|
|
41
49
|
)
|
|
@@ -108,7 +116,6 @@ strange_double_quotes = [
|
|
|
108
116
|
"〟",
|
|
109
117
|
""",
|
|
110
118
|
]
|
|
111
|
-
|
|
112
119
|
strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]
|
|
113
120
|
|
|
114
121
|
DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
|
|
@@ -120,4 +127,4 @@ ISOLATED_LETTERS_REGEX = re.compile(r"(?:^|\s)[B, C, D, E, F, G, H, I, J, K, L,
|
|
|
120
127
|
|
|
121
128
|
ISOLATED_SPECIAL_SYMBOLS_REGEX = re.compile(r"(?<![a-zA-Z0-9])[:_.|><;·}@~!?+#)({,/\\\\^]+(?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
|
|
122
129
|
|
|
123
|
-
SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
|
|
130
|
+
SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from sct.utils import constants
|
|
2
|
+
from bs4 import BeautifulSoup
|
|
3
|
+
|
|
4
|
+
class ProcessContacts:
|
|
5
|
+
|
|
6
|
+
def __init__(self):
|
|
7
|
+
pass
|
|
8
|
+
|
|
9
|
+
def replace_urls(self, text, replace_with="<URL>"):
|
|
10
|
+
"""
|
|
11
|
+
Replace all URLs in ``text`` str with ``replace_with`` str.
|
|
12
|
+
"""
|
|
13
|
+
# matches = constants.URL_REGEX.finditer(text)
|
|
14
|
+
# result = text
|
|
15
|
+
# # Iterate through matches in reverse order (to avoid index issues)
|
|
16
|
+
# for match in reversed(list(matches)):
|
|
17
|
+
# # Check if the matched substring contains non-ASCII characters
|
|
18
|
+
# if not any(ord(char) > 127 for char in match.group()):
|
|
19
|
+
# result = text[:match.start()] + replace_with + text[match.end():]
|
|
20
|
+
return constants.URL_REGEX.sub(replace_with, text)
|
|
21
|
+
|
|
22
|
+
def replace_html(self, text, replace_with="<HTML>"):
|
|
23
|
+
"""
|
|
24
|
+
Replace all html tags in ``text`` str with ``replace_with`` str.
|
|
25
|
+
"""
|
|
26
|
+
try:
|
|
27
|
+
soup = BeautifulSoup(text, 'html.parser')
|
|
28
|
+
text = soup.get_text()
|
|
29
|
+
except:
|
|
30
|
+
text = constants.HTML_REGEX.sub(replace_with, text)
|
|
31
|
+
|
|
32
|
+
return text
|
|
33
|
+
|
|
34
|
+
def replace_emails(self, text, replace_with="<EMAIL>"):
|
|
35
|
+
"""
|
|
36
|
+
Replace all emails in ``text`` str with ``replace_with`` str.
|
|
37
|
+
"""
|
|
38
|
+
return constants.EMAIL_REGEX.sub(replace_with, text)
|
|
39
|
+
|
|
40
|
+
def replace_phone_numbers(self, text, replace_with="<PHONE>"):
|
|
41
|
+
"""
|
|
42
|
+
Replace all phone numbers in ``text`` str with ``replace_with`` str.
|
|
43
|
+
"""
|
|
44
|
+
return constants.PHONE_REGEX.sub(replace_with, text)
|
|
45
|
+
|
|
46
|
+
def replace_numbers(self, text, replace_with="<NUMBER>"):
|
|
47
|
+
"""
|
|
48
|
+
Replace all numbers in ``text`` str with ``replace_with`` str.
|
|
49
|
+
"""
|
|
50
|
+
return constants.NUMBERS_REGEX.sub(replace_with, text)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
from sct.utils import constants
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class ProcessDateTime:
|
|
5
|
+
|
|
6
|
+
def __init__(self):
|
|
7
|
+
pass
|
|
8
|
+
|
|
9
|
+
def replace_years(self, text, replace_with ="<YEAR>"):
|
|
10
|
+
"""
|
|
11
|
+
Replaces years between 1900 to 2099 in the text with a special token.
|
|
12
|
+
"""
|
|
13
|
+
cleaned_string = constants.YEAR_REGEX.sub(replace_with, text)
|
|
14
|
+
|
|
15
|
+
return cleaned_string
|