SqueakyCleanText 0.2.3__tar.gz → 0.2.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {SqueakyCleanText-0.2.3/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.5}/PKG-INFO +34 -31
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/README.md +33 -30
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info}/PKG-INFO +34 -31
- SqueakyCleanText-0.2.5/sct/config.py +100 -0
- SqueakyCleanText-0.2.5/sct/sct.py +341 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/constants.py +2 -9
- SqueakyCleanText-0.2.5/sct/utils/contact.py +88 -0
- SqueakyCleanText-0.2.5/sct/utils/datetime.py +26 -0
- SqueakyCleanText-0.2.5/sct/utils/ner.py +367 -0
- SqueakyCleanText-0.2.5/sct/utils/normtext.py +148 -0
- SqueakyCleanText-0.2.5/sct/utils/special.py +113 -0
- SqueakyCleanText-0.2.5/sct/utils/stopwords.py +62 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/setup.py +1 -1
- SqueakyCleanText-0.2.3/sct/config.py +0 -57
- SqueakyCleanText-0.2.3/sct/sct.py +0 -129
- SqueakyCleanText-0.2.3/sct/utils/contact.py +0 -50
- SqueakyCleanText-0.2.3/sct/utils/datetime.py +0 -15
- SqueakyCleanText-0.2.3/sct/utils/ner.py +0 -227
- SqueakyCleanText-0.2.3/sct/utils/normtext.py +0 -83
- SqueakyCleanText-0.2.3/sct/utils/special.py +0 -50
- SqueakyCleanText-0.2.3/sct/utils/stopwords.py +0 -35
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/LICENSE +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/MANIFEST.in +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/requires.txt +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/top_level.txt +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/__init__.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/scripts/__init__.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/scripts/download_nltk_stopwords.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/__init__.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/resources.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/setup.cfg +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/tests/__init__.py +0 -0
- {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/tests/test_sct.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: SqueakyCleanText
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
5
|
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
6
|
Author: Rehan Fazal
|
|
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
156
156
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
157
157
|
|
|
158
158
|
```python
|
|
159
|
+
from sct import sct, config
|
|
159
160
|
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
160
161
|
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
161
|
-
CHECK_DETECT_LANGUAGE = True
|
|
162
|
-
CHECK_FIX_BAD_UNICODE = True
|
|
163
|
-
CHECK_TO_ASCII_UNICODE = True
|
|
164
|
-
CHECK_REPLACE_HTML = True
|
|
165
|
-
CHECK_REPLACE_URLS = True
|
|
166
|
-
CHECK_REPLACE_EMAILS = True
|
|
167
|
-
CHECK_REPLACE_YEARS = True
|
|
168
|
-
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
169
|
-
CHECK_REPLACE_NUMBERS = True
|
|
170
|
-
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
171
|
-
CHECK_NER_PROCESS = True
|
|
172
|
-
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
173
|
-
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
174
|
-
CHECK_NORMALIZE_WHITESPACE = True
|
|
175
|
-
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
176
|
-
CHECK_CASEFOLD = True
|
|
177
|
-
CHECK_REMOVE_STOPWORDS = True
|
|
178
|
-
CHECK_REMOVE_PUNCTUATION = True
|
|
179
|
-
CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
162
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
163
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
164
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
165
|
+
config.CHECK_REPLACE_HTML = True
|
|
166
|
+
config.CHECK_REPLACE_URLS = True
|
|
167
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
168
|
+
config.CHECK_REPLACE_YEARS = True
|
|
169
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
170
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
171
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
172
|
+
config.CHECK_NER_PROCESS = True
|
|
173
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
174
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
175
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
176
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
177
|
+
config.CHECK_CASEFOLD = True
|
|
178
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
179
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
180
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
180
181
|
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
181
|
-
REPLACE_WITH_URL = "<URL>"
|
|
182
|
-
REPLACE_WITH_HTML = "<HTML>"
|
|
183
|
-
REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
184
|
-
REPLACE_WITH_YEARS = "<YEAR>"
|
|
185
|
-
REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
186
|
-
REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
187
|
-
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
182
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
183
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
184
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
185
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
186
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
187
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
188
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
188
189
|
# You can remove any of the tags
|
|
189
|
-
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
190
|
-
NER_CONFIDENCE_THRESHOLD = 0.85
|
|
190
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
191
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
191
192
|
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
192
|
-
LANGUAGE = None
|
|
193
|
+
config.LANGUAGE = None
|
|
193
194
|
|
|
194
195
|
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
195
196
|
# All models passed need to support transformers AutoModel
|
|
196
|
-
NER_MODELS_LIST = [
|
|
197
|
+
config.NER_MODELS_LIST = [
|
|
197
198
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
198
199
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
199
200
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
200
201
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
201
202
|
"Babelscape/wikineural-multilingual-ner"
|
|
202
203
|
]
|
|
204
|
+
|
|
205
|
+
sx = sct.TextCleaner()
|
|
203
206
|
```
|
|
204
207
|
|
|
205
208
|
## API
|
|
@@ -132,50 +132,53 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
132
132
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
133
133
|
|
|
134
134
|
```python
|
|
135
|
+
from sct import sct, config
|
|
135
136
|
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
136
137
|
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
137
|
-
CHECK_DETECT_LANGUAGE = True
|
|
138
|
-
CHECK_FIX_BAD_UNICODE = True
|
|
139
|
-
CHECK_TO_ASCII_UNICODE = True
|
|
140
|
-
CHECK_REPLACE_HTML = True
|
|
141
|
-
CHECK_REPLACE_URLS = True
|
|
142
|
-
CHECK_REPLACE_EMAILS = True
|
|
143
|
-
CHECK_REPLACE_YEARS = True
|
|
144
|
-
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
145
|
-
CHECK_REPLACE_NUMBERS = True
|
|
146
|
-
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
147
|
-
CHECK_NER_PROCESS = True
|
|
148
|
-
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
149
|
-
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
150
|
-
CHECK_NORMALIZE_WHITESPACE = True
|
|
151
|
-
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
152
|
-
CHECK_CASEFOLD = True
|
|
153
|
-
CHECK_REMOVE_STOPWORDS = True
|
|
154
|
-
CHECK_REMOVE_PUNCTUATION = True
|
|
155
|
-
CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
138
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
139
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
140
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
141
|
+
config.CHECK_REPLACE_HTML = True
|
|
142
|
+
config.CHECK_REPLACE_URLS = True
|
|
143
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
144
|
+
config.CHECK_REPLACE_YEARS = True
|
|
145
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
146
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
147
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
148
|
+
config.CHECK_NER_PROCESS = True
|
|
149
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
150
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
151
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
152
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
153
|
+
config.CHECK_CASEFOLD = True
|
|
154
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
155
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
156
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
156
157
|
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
157
|
-
REPLACE_WITH_URL = "<URL>"
|
|
158
|
-
REPLACE_WITH_HTML = "<HTML>"
|
|
159
|
-
REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
160
|
-
REPLACE_WITH_YEARS = "<YEAR>"
|
|
161
|
-
REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
162
|
-
REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
163
|
-
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
158
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
159
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
160
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
161
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
162
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
163
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
164
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
164
165
|
# You can remove any of the tags
|
|
165
|
-
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
166
|
-
NER_CONFIDENCE_THRESHOLD = 0.85
|
|
166
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
167
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
167
168
|
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
168
|
-
LANGUAGE = None
|
|
169
|
+
config.LANGUAGE = None
|
|
169
170
|
|
|
170
171
|
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
171
172
|
# All models passed need to support transformers AutoModel
|
|
172
|
-
NER_MODELS_LIST = [
|
|
173
|
+
config.NER_MODELS_LIST = [
|
|
173
174
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
174
175
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
175
176
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
176
177
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
177
178
|
"Babelscape/wikineural-multilingual-ner"
|
|
178
179
|
]
|
|
180
|
+
|
|
181
|
+
sx = sct.TextCleaner()
|
|
179
182
|
```
|
|
180
183
|
|
|
181
184
|
## API
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: SqueakyCleanText
|
|
3
|
-
Version: 0.2.3
|
|
3
|
+
Version: 0.2.5
|
|
4
4
|
Summary: A comprehensive text cleaning and preprocessing pipeline.
|
|
5
5
|
Home-page: https://github.com/rhnfzl/SqueakyCleanText
|
|
6
6
|
Author: Rehan Fazal
|
|
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
|
|
|
156
156
|
Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
|
|
157
157
|
|
|
158
158
|
```python
|
|
159
|
+
from sct import sct, config
|
|
159
160
|
# In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
|
|
160
161
|
# then only CHECK_DETECT_LANGUAGE will be considered False.
|
|
161
|
-
CHECK_DETECT_LANGUAGE = True
|
|
162
|
-
CHECK_FIX_BAD_UNICODE = True
|
|
163
|
-
CHECK_TO_ASCII_UNICODE = True
|
|
164
|
-
CHECK_REPLACE_HTML = True
|
|
165
|
-
CHECK_REPLACE_URLS = True
|
|
166
|
-
CHECK_REPLACE_EMAILS = True
|
|
167
|
-
CHECK_REPLACE_YEARS = True
|
|
168
|
-
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
169
|
-
CHECK_REPLACE_NUMBERS = True
|
|
170
|
-
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
171
|
-
CHECK_NER_PROCESS = True
|
|
172
|
-
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
173
|
-
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
174
|
-
CHECK_NORMALIZE_WHITESPACE = True
|
|
175
|
-
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
176
|
-
CHECK_CASEFOLD = True
|
|
177
|
-
CHECK_REMOVE_STOPWORDS = True
|
|
178
|
-
CHECK_REMOVE_PUNCTUATION = True
|
|
179
|
-
CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
162
|
+
config.CHECK_DETECT_LANGUAGE = True
|
|
163
|
+
config.CHECK_FIX_BAD_UNICODE = True
|
|
164
|
+
config.CHECK_TO_ASCII_UNICODE = True
|
|
165
|
+
config.CHECK_REPLACE_HTML = True
|
|
166
|
+
config.CHECK_REPLACE_URLS = True
|
|
167
|
+
config.CHECK_REPLACE_EMAILS = True
|
|
168
|
+
config.CHECK_REPLACE_YEARS = True
|
|
169
|
+
config.CHECK_REPLACE_PHONE_NUMBERS = True
|
|
170
|
+
config.CHECK_REPLACE_NUMBERS = True
|
|
171
|
+
config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
172
|
+
config.CHECK_NER_PROCESS = True
|
|
173
|
+
config.CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
174
|
+
config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
175
|
+
config.CHECK_NORMALIZE_WHITESPACE = True
|
|
176
|
+
config.CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
177
|
+
config.CHECK_CASEFOLD = True
|
|
178
|
+
config.CHECK_REMOVE_STOPWORDS = True
|
|
179
|
+
config.CHECK_REMOVE_PUNCTUATION = True
|
|
180
|
+
config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
|
|
180
181
|
# Tags can be replaced if needed, like if no special tags are necessary "" can be passed
|
|
181
|
-
REPLACE_WITH_URL = "<URL>"
|
|
182
|
-
REPLACE_WITH_HTML = "<HTML>"
|
|
183
|
-
REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
184
|
-
REPLACE_WITH_YEARS = "<YEAR>"
|
|
185
|
-
REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
186
|
-
REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
187
|
-
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
182
|
+
config.REPLACE_WITH_URL = "<URL>"
|
|
183
|
+
config.REPLACE_WITH_HTML = "<HTML>"
|
|
184
|
+
config.REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
185
|
+
config.REPLACE_WITH_YEARS = "<YEAR>"
|
|
186
|
+
config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
187
|
+
config.REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
188
|
+
config.REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
188
189
|
# You can remove any of the tags
|
|
189
|
-
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
190
|
-
NER_CONFIDENCE_THRESHOLD = 0.85
|
|
190
|
+
config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
191
|
+
config.NER_CONFIDENCE_THRESHOLD = 0.85
|
|
191
192
|
# Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
|
|
192
|
-
LANGUAGE = None
|
|
193
|
+
config.LANGUAGE = None
|
|
193
194
|
|
|
194
195
|
# Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
|
|
195
196
|
# All models passed need to support transformers AutoModel
|
|
196
|
-
NER_MODELS_LIST = [
|
|
197
|
+
config.NER_MODELS_LIST = [
|
|
197
198
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-english",
|
|
198
199
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
|
|
199
200
|
"FacebookAI/xlm-roberta-large-finetuned-conll03-german",
|
|
200
201
|
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
|
|
201
202
|
"Babelscape/wikineural-multilingual-ner"
|
|
202
203
|
]
|
|
204
|
+
|
|
205
|
+
sx = sct.TextCleaner()
|
|
203
206
|
```
|
|
204
207
|
|
|
205
208
|
## API
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Module containing the configuration parameters for the SCT package.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
# Flag to detect the language automatically. If True, the language will be detected for each text.
|
|
6
|
+
CHECK_DETECT_LANGUAGE = True
|
|
7
|
+
|
|
8
|
+
# Flag to fix "broken" unicode such as mojibake and garbled HTML entities.
|
|
9
|
+
CHECK_FIX_BAD_UNICODE = True
|
|
10
|
+
|
|
11
|
+
# Flag to convert non-ASCII characters into their closest ASCII equivalents.
|
|
12
|
+
CHECK_TO_ASCII_UNICODE = True
|
|
13
|
+
|
|
14
|
+
# Flag to replace HTML tags with a special token.
|
|
15
|
+
CHECK_REPLACE_HTML = True
|
|
16
|
+
|
|
17
|
+
# Flag to replace URLs with a special token.
|
|
18
|
+
CHECK_REPLACE_URLS = True
|
|
19
|
+
|
|
20
|
+
# Flag to replace email addresses with a special token.
|
|
21
|
+
CHECK_REPLACE_EMAILS = True
|
|
22
|
+
|
|
23
|
+
# Flag to replace years with a special token.
|
|
24
|
+
CHECK_REPLACE_YEARS = True
|
|
25
|
+
|
|
26
|
+
# Flag to replace phone numbers with a special token.
|
|
27
|
+
CHECK_REPLACE_PHONE_NUMBERS = True
|
|
28
|
+
|
|
29
|
+
# Flag to replace numbers with a special token.
|
|
30
|
+
CHECK_REPLACE_NUMBERS = True
|
|
31
|
+
|
|
32
|
+
# Flag to replace currency symbols with their respective alphabetical equivalents.
|
|
33
|
+
CHECK_REPLACE_CURRENCY_SYMBOLS = True
|
|
34
|
+
|
|
35
|
+
# Flag to execute Named Entity Recognition (NER) to remove positional tags such as PER, LOC, ORG, MISC.
|
|
36
|
+
CHECK_NER_PROCESS = True
|
|
37
|
+
|
|
38
|
+
# Flag to remove any isolated letters which do not add any value to the text.
|
|
39
|
+
CHECK_REMOVE_ISOLATED_LETTERS = True
|
|
40
|
+
|
|
41
|
+
# Flag to remove any isolated symbols which should not be present in the text.
|
|
42
|
+
CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
|
|
43
|
+
|
|
44
|
+
# Flag to remove any unnecessary whitespace.
|
|
45
|
+
CHECK_NORMALIZE_WHITESPACE = True
|
|
46
|
+
|
|
47
|
+
# Flag to get the statistical model text, such as for fastText, SVM, LR.
|
|
48
|
+
CHECK_STATISTICAL_MODEL_PROCESSING = True
|
|
49
|
+
|
|
50
|
+
# Flag to convert all characters to lowercase.
|
|
51
|
+
CHECK_CASEFOLD = True
|
|
52
|
+
|
|
53
|
+
# Flag to remove stopwords based on the language. Uses NLTK stopwords.
|
|
54
|
+
CHECK_REMOVE_STOPWORDS = True
|
|
55
|
+
|
|
56
|
+
# Flag to remove all special symbols.
|
|
57
|
+
CHECK_REMOVE_PUNCTUATION = True
|
|
58
|
+
|
|
59
|
+
# Flag to remove custom stopwords specific to the SCT package.
|
|
60
|
+
CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
|
|
61
|
+
|
|
62
|
+
# Special token to replace URLs.
|
|
63
|
+
REPLACE_WITH_URL = "<URL>"
|
|
64
|
+
|
|
65
|
+
# Special token to replace HTML tags.
|
|
66
|
+
REPLACE_WITH_HTML = "<HTML>"
|
|
67
|
+
|
|
68
|
+
# Special token to replace email addresses.
|
|
69
|
+
REPLACE_WITH_EMAIL = "<EMAIL>"
|
|
70
|
+
|
|
71
|
+
# Special token to replace years.
|
|
72
|
+
REPLACE_WITH_YEARS = "<YEAR>"
|
|
73
|
+
|
|
74
|
+
# Special token to replace phone numbers.
|
|
75
|
+
REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
|
|
76
|
+
|
|
77
|
+
# Special token to replace numbers.
|
|
78
|
+
REPLACE_WITH_NUMBERS = "<NUMBER>"
|
|
79
|
+
|
|
80
|
+
# Special token to replace currency symbols. If None, symbols will be replaced with their 3-letter abbreviations.
|
|
81
|
+
REPLACE_WITH_CURRENCY_SYMBOLS = None
|
|
82
|
+
|
|
83
|
+
# List of positional tags to be removed by NER.
|
|
84
|
+
POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
|
|
85
|
+
|
|
86
|
+
# Confidence threshold for NER.
|
|
87
|
+
NER_CONFIDENCE_THRESHOLD = 0.85
|
|
88
|
+
|
|
89
|
+
# Language to be used for NER. If None, the language will be detected automatically.
|
|
90
|
+
LANGUAGE = None
|
|
91
|
+
|
|
92
|
+
# List of pre-trained NER models in order of importance.
|
|
93
|
+
NER_MODELS_LIST = [
|
|
94
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-english", # English Model
|
|
95
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-dutch", # Dutch Model
|
|
96
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll03-german", # German Model
|
|
97
|
+
"FacebookAI/xlm-roberta-large-finetuned-conll02-spanish", # Spanish Model
|
|
98
|
+
"Babelscape/wikineural-multilingual-ner" # Multilingual Model
|
|
99
|
+
]
|
|
100
|
+
|