SqueakyCleanText 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.6}/PKG-INFO +49 -48
  2. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/README.md +49 -48
  3. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6/SqueakyCleanText.egg-info}/PKG-INFO +49 -48
  4. SqueakyCleanText-0.2.6/sct/config.py +57 -0
  5. SqueakyCleanText-0.2.6/sct/sct.py +129 -0
  6. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/constants.py +9 -2
  7. SqueakyCleanText-0.2.6/sct/utils/contact.py +50 -0
  8. SqueakyCleanText-0.2.6/sct/utils/datetime.py +15 -0
  9. SqueakyCleanText-0.2.6/sct/utils/ner.py +228 -0
  10. SqueakyCleanText-0.2.6/sct/utils/normtext.py +83 -0
  11. SqueakyCleanText-0.2.6/sct/utils/special.py +50 -0
  12. SqueakyCleanText-0.2.6/sct/utils/stopwords.py +35 -0
  13. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/setup.py +1 -1
  14. SqueakyCleanText-0.2.5/sct/config.py +0 -100
  15. SqueakyCleanText-0.2.5/sct/sct.py +0 -341
  16. SqueakyCleanText-0.2.5/sct/utils/contact.py +0 -88
  17. SqueakyCleanText-0.2.5/sct/utils/datetime.py +0 -26
  18. SqueakyCleanText-0.2.5/sct/utils/ner.py +0 -367
  19. SqueakyCleanText-0.2.5/sct/utils/normtext.py +0 -148
  20. SqueakyCleanText-0.2.5/sct/utils/special.py +0 -113
  21. SqueakyCleanText-0.2.5/sct/utils/stopwords.py +0 -62
  22. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/LICENSE +0 -0
  23. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/MANIFEST.in +0 -0
  24. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
  25. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
  26. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
  27. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/requires.txt +0 -0
  28. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/SqueakyCleanText.egg-info/top_level.txt +0 -0
  29. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/__init__.py +0 -0
  30. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/scripts/__init__.py +0 -0
  31. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/scripts/download_nltk_stopwords.py +0 -0
  32. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/__init__.py +0 -0
  33. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/resources.py +0 -0
  34. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/setup.cfg +0 -0
  35. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/tests/__init__.py +0 -0
  36. {SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/tests/test_sct.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
- from sct import sct, config
160
- # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
161
- # then only CHECK_DETECT_LANGUAGE will be considered False.
162
- config.CHECK_DETECT_LANGUAGE = True
163
- config.CHECK_FIX_BAD_UNICODE = True
164
- config.CHECK_TO_ASCII_UNICODE = True
165
- config.CHECK_REPLACE_HTML = True
166
- config.CHECK_REPLACE_URLS = True
167
- config.CHECK_REPLACE_EMAILS = True
168
- config.CHECK_REPLACE_YEARS = True
169
- config.CHECK_REPLACE_PHONE_NUMBERS = True
170
- config.CHECK_REPLACE_NUMBERS = True
171
- config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
- config.CHECK_NER_PROCESS = True
173
- config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
- config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
- config.CHECK_NORMALIZE_WHITESPACE = True
176
- config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
- config.CHECK_CASEFOLD = True
178
- config.CHECK_REMOVE_STOPWORDS = True
179
- config.CHECK_REMOVE_PUNCTUATION = True
180
- config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
181
- # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
182
- config.REPLACE_WITH_URL = "<URL>"
183
- config.REPLACE_WITH_HTML = "<HTML>"
184
- config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
- config.REPLACE_WITH_YEARS = "<YEAR>"
186
- config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
- config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
- config.REPLACE_WITH_CURRENCY_SYMBOLS = None
189
- # You can remove any of the tags
190
- config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
- config.NER_CONFIDENCE_THRESHOLD = 0.85
192
- # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
193
- config.LANGUAGE = None
194
-
195
- # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
196
- # All models passed need to support transformers AutoModel
197
- config.NER_MODELS_LIST = [
198
- "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
199
- "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
200
- "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
201
- "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
202
- "Babelscape/wikineural-multilingual-ner"
203
- ]
204
-
205
- sx = sct.TextCleaner()
159
+ from sct import sct, config
160
+ # Setting CHECK_DETECT_LANGUAGE = False only disables language detection when NER and stopword removal
161
+ # are disabled as well, because both of those steps rely on the detected language.
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
181
+ # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
189
+ # You can remove any of the tags
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
192
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
193
+ config.LANGUAGE = None
194
+
195
+ # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
196
+ # All models passed need to support transformers AutoModel
197
+ config.NER_MODELS_LIST = [
198
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
199
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
200
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
201
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
202
+ "Babelscape/wikineural-multilingual-ner"
203
+ ]
204
+
205
+ sx = sct.TextCleaner()
206
206
  ```
207
207
 
208
+
208
209
  ## API
209
210
 
210
211
  ### `sct.TextCleaner`
@@ -132,55 +132,56 @@ You can modify the package’s functionality by changing settings in the configu
132
132
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
133
133
 
134
134
  ```python
135
- from sct import sct, config
136
- # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
137
- # then only CHECK_DETECT_LANGUAGE will be considered False.
138
- config.CHECK_DETECT_LANGUAGE = True
139
- config.CHECK_FIX_BAD_UNICODE = True
140
- config.CHECK_TO_ASCII_UNICODE = True
141
- config.CHECK_REPLACE_HTML = True
142
- config.CHECK_REPLACE_URLS = True
143
- config.CHECK_REPLACE_EMAILS = True
144
- config.CHECK_REPLACE_YEARS = True
145
- config.CHECK_REPLACE_PHONE_NUMBERS = True
146
- config.CHECK_REPLACE_NUMBERS = True
147
- config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
148
- config.CHECK_NER_PROCESS = True
149
- config.CHECK_REMOVE_ISOLATED_LETTERS = True
150
- config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
151
- config.CHECK_NORMALIZE_WHITESPACE = True
152
- config.CHECK_STATISTICAL_MODEL_PROCESSING = True
153
- config.CHECK_CASEFOLD = True
154
- config.CHECK_REMOVE_STOPWORDS = True
155
- config.CHECK_REMOVE_PUNCTUATION = True
156
- config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
157
- # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
158
- config.REPLACE_WITH_URL = "<URL>"
159
- config.REPLACE_WITH_HTML = "<HTML>"
160
- config.REPLACE_WITH_EMAIL = "<EMAIL>"
161
- config.REPLACE_WITH_YEARS = "<YEAR>"
162
- config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
163
- config.REPLACE_WITH_NUMBERS = "<NUMBER>"
164
- config.REPLACE_WITH_CURRENCY_SYMBOLS = None
165
- # You can remove any of the tags
166
- config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
167
- config.NER_CONFIDENCE_THRESHOLD = 0.85
168
- # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
169
- config.LANGUAGE = None
170
-
171
- # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
172
- # All models passed need to support transformers AutoModel
173
- config.NER_MODELS_LIST = [
174
- "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
175
- "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
176
- "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
177
- "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
178
- "Babelscape/wikineural-multilingual-ner"
179
- ]
180
-
181
- sx = sct.TextCleaner()
135
+ from sct import sct, config
136
+ # Setting CHECK_DETECT_LANGUAGE = False only disables language detection when NER and stopword removal
137
+ # are disabled as well, because both of those steps rely on the detected language.
138
+ config.CHECK_DETECT_LANGUAGE = True
139
+ config.CHECK_FIX_BAD_UNICODE = True
140
+ config.CHECK_TO_ASCII_UNICODE = True
141
+ config.CHECK_REPLACE_HTML = True
142
+ config.CHECK_REPLACE_URLS = True
143
+ config.CHECK_REPLACE_EMAILS = True
144
+ config.CHECK_REPLACE_YEARS = True
145
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
146
+ config.CHECK_REPLACE_NUMBERS = True
147
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
148
+ config.CHECK_NER_PROCESS = True
149
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
150
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
151
+ config.CHECK_NORMALIZE_WHITESPACE = True
152
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
153
+ config.CHECK_CASEFOLD = True
154
+ config.CHECK_REMOVE_STOPWORDS = True
155
+ config.CHECK_REMOVE_PUNCTUATION = True
156
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
157
+ # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
158
+ config.REPLACE_WITH_URL = "<URL>"
159
+ config.REPLACE_WITH_HTML = "<HTML>"
160
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
161
+ config.REPLACE_WITH_YEARS = "<YEAR>"
162
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
163
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
164
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
165
+ # You can remove any of the tags
166
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
167
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
168
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
169
+ config.LANGUAGE = None
170
+
171
+ # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
172
+ # All models passed need to support transformers AutoModel
173
+ config.NER_MODELS_LIST = [
174
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
175
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
176
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
177
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
178
+ "Babelscape/wikineural-multilingual-ner"
179
+ ]
180
+
181
+ sx = sct.TextCleaner()
182
182
  ```
183
183
 
184
+
184
185
  ## API
185
186
 
186
187
  ### `sct.TextCleaner`
@@ -204,4 +205,4 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
204
205
 
205
206
  The package took inspiration from the following repository:
206
207
 
207
- - [clean-text](https://github.com/jfilter/clean-text)
208
+ - [clean-text](https://github.com/jfilter/clean-text)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.5
3
+ Version: 0.2.6
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
- from sct import sct, config
160
- # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
161
- # then only CHECK_DETECT_LANGUAGE will be considered False.
162
- config.CHECK_DETECT_LANGUAGE = True
163
- config.CHECK_FIX_BAD_UNICODE = True
164
- config.CHECK_TO_ASCII_UNICODE = True
165
- config.CHECK_REPLACE_HTML = True
166
- config.CHECK_REPLACE_URLS = True
167
- config.CHECK_REPLACE_EMAILS = True
168
- config.CHECK_REPLACE_YEARS = True
169
- config.CHECK_REPLACE_PHONE_NUMBERS = True
170
- config.CHECK_REPLACE_NUMBERS = True
171
- config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
- config.CHECK_NER_PROCESS = True
173
- config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
- config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
- config.CHECK_NORMALIZE_WHITESPACE = True
176
- config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
- config.CHECK_CASEFOLD = True
178
- config.CHECK_REMOVE_STOPWORDS = True
179
- config.CHECK_REMOVE_PUNCTUATION = True
180
- config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
181
- # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
182
- config.REPLACE_WITH_URL = "<URL>"
183
- config.REPLACE_WITH_HTML = "<HTML>"
184
- config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
- config.REPLACE_WITH_YEARS = "<YEAR>"
186
- config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
- config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
- config.REPLACE_WITH_CURRENCY_SYMBOLS = None
189
- # You can remove any of the tags
190
- config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
- config.NER_CONFIDENCE_THRESHOLD = 0.85
192
- # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
193
- config.LANGUAGE = None
194
-
195
- # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
196
- # All models passed need to support transformers AutoModel
197
- config.NER_MODELS_LIST = [
198
- "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
199
- "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
200
- "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
201
- "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
202
- "Babelscape/wikineural-multilingual-ner"
203
- ]
204
-
205
- sx = sct.TextCleaner()
159
+ from sct import sct, config
160
+ # Setting CHECK_DETECT_LANGUAGE = False only disables language detection when NER and stopword removal
161
+ # are disabled as well, because both of those steps rely on the detected language.
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
181
+ # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
189
+ # You can remove any of the tags
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
192
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
193
+ config.LANGUAGE = None
194
+
195
+ # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
196
+ # All models passed need to support transformers AutoModel
197
+ config.NER_MODELS_LIST = [
198
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
199
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
200
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
201
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
202
+ "Babelscape/wikineural-multilingual-ner"
203
+ ]
204
+
205
+ sx = sct.TextCleaner()
206
206
  ```
207
207
 
208
+
208
209
  ## API
209
210
 
210
211
  ### `sct.TextCleaner`
@@ -0,0 +1,57 @@
1
+ """
2
+ detect_language : to detect the language automatically, but would consume more time if done on a batch
3
+ fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
4
+ to_ascii_unicode : if True, convert non-ASCII characters into their closest ASCII equivalents
5
+ replace_with_url : special URL token, default "",
6
+ replace_with_email : special EMAIL token, default "",
7
+ replace_years : replace year, default "",
8
+ replace_with_phone_number : special PHONE token, default "",
9
+ replace_with_number : special NUMBER token, default "",
10
+ no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
11
+ ner_process : To execute NER Process to remove the positional tags, PER, LOC, ORG, MISC
12
+ remove_isolated_letters : remove any isolated letters that don't add any value to the text
13
+ remove_isolated_symbols : remove any isolated symbols which shouldn't be present in the text, usually those that aren't
14
+ immediately prefixed and suffixed by a letter or number
15
+ normalize_whitespace : remove any unnecessary whitespace
16
+ statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
17
+ casefold : to lower the text
18
+ remove_stopwords : remove stopwords based on the language, uses NLTK stopwords
19
+ remove_punctuation : removes all the special symbols
20
+ """
21
+
22
+ CHECK_DETECT_LANGUAGE = True
23
+ CHECK_FIX_BAD_UNICODE = True
24
+ CHECK_TO_ASCII_UNICODE = True
25
+ CHECK_REPLACE_HTML = True
26
+ CHECK_REPLACE_URLS = True
27
+ CHECK_REPLACE_EMAILS = True
28
+ CHECK_REPLACE_YEARS = True
29
+ CHECK_REPLACE_PHONE_NUMBERS = True
30
+ CHECK_REPLACE_NUMBERS = True
31
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
32
+ CHECK_NER_PROCESS = True
33
+ CHECK_REMOVE_ISOLATED_LETTERS = True
34
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
35
+ CHECK_NORMALIZE_WHITESPACE = True
36
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
37
+ CHECK_CASEFOLD = True
38
+ CHECK_REMOVE_STOPWORDS = True
39
+ CHECK_REMOVE_PUNCTUATION = True
40
+ CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
41
+ REPLACE_WITH_URL = "<URL>"
42
+ REPLACE_WITH_HTML = "<HTML>"
43
+ REPLACE_WITH_EMAIL = "<EMAIL>"
44
+ REPLACE_WITH_YEARS = "<YEAR>"
45
+ REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
46
+ REPLACE_WITH_NUMBERS = "<NUMBER>"
47
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
48
+ POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
49
+ NER_CONFIDENCE_THRESHOLD = 0.85
50
+ LANGUAGE = None
51
+
52
+ # Order of the model is Important : English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
53
+ NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
54
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
55
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
56
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
57
+ "Babelscape/wikineural-multilingual-ner"]
@@ -0,0 +1,129 @@
1
+ """
2
+ This code provides a comprehensive text cleaning and preprocessing pipeline.
3
+ It includes functions to normalize, remove personal information and clean text data,
4
+ which is crucial for natural language processing tasks.
5
+ """
6
+ from sct import config
7
+ from sct.utils import contact, datetime, ner, normtext, resources, special, stopwords
8
+
9
+ class TextCleaner:
10
+
11
+ def __init__(self):
12
+ self.ProcessContacts = contact.ProcessContacts()
13
+ self.ProcessDateTime = datetime.ProcessDateTime()
14
+ self.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
15
+ self.NormaliseText = normtext.NormaliseText()
16
+ self.ProcessStopwords = stopwords.ProcessStopwords()
17
+ self.GeneralNER = ner.GeneralNER()
18
+ self.pipeline = []
19
+ self.language = None
20
+ self.init_pipeline()
21
+
22
+ def init_pipeline(self):
23
+ # Initialize pipeline steps based on config
24
+ language_config = config.LANGUAGE.lower() if config.LANGUAGE else None
25
+
26
+ if language_config and language_config in resources.LANGUAGE_NAME:
27
+ self.language = language_config.upper()
28
+ elif any([config.CHECK_DETECT_LANGUAGE, config.CHECK_NER_PROCESS, config.CHECK_REMOVE_STOPWORDS]):
29
+ self.pipeline.append(self.detect_language)
30
+
31
+ if config.CHECK_FIX_BAD_UNICODE:
32
+ self.pipeline.append(self.fix_bad_unicode)
33
+ if config.CHECK_TO_ASCII_UNICODE:
34
+ self.pipeline.append(self.to_ascii_unicode)
35
+
36
+ if config.CHECK_REPLACE_HTML:
37
+ self.pipeline.append(self.replace_html)
38
+ if config.CHECK_REPLACE_URLS:
39
+ self.pipeline.append(self.replace_urls)
40
+ if config.CHECK_REPLACE_EMAILS:
41
+ self.pipeline.append(self.replace_emails)
42
+ if config.CHECK_REPLACE_YEARS:
43
+ self.pipeline.append(self.replace_years)
44
+ if config.CHECK_REPLACE_PHONE_NUMBERS:
45
+ self.pipeline.append(self.replace_phone_numbers)
46
+ if config.CHECK_REPLACE_NUMBERS:
47
+ self.pipeline.append(self.replace_numbers)
48
+ if config.CHECK_REPLACE_CURRENCY_SYMBOLS:
49
+ self.pipeline.append(self.replace_currency_symbols)
50
+
51
+ if config.CHECK_NER_PROCESS:
52
+ self.pipeline.append(self.ner_process)
53
+
54
+ if config.CHECK_REMOVE_ISOLATED_LETTERS:
55
+ self.pipeline.append(self.remove_isolated_letters)
56
+ if config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS:
57
+ self.pipeline.append(self.remove_isolated_special_symbols)
58
+ if config.CHECK_NORMALIZE_WHITESPACE:
59
+ self.pipeline.append(self.normalize_whitespace)
60
+
61
+ def process(self, text):
62
+ text = str(text)
63
+
64
+ for step in self.pipeline:
65
+ text = step(text)
66
+
67
+ if config.CHECK_STATISTICAL_MODEL_PROCESSING:
68
+ stext = self.statistical_model_processing(text)
69
+ return text, stext, self.language
70
+ elif config.CHECK_DETECT_LANGUAGE:
71
+ return text, self.language
72
+ else:
73
+ return text
74
+
75
+ def detect_language(self, text):
76
+ self.language = str(resources.DETECTOR.detect_language_of(text)).split(".")[-1]
77
+ return text
78
+
79
+ def fix_bad_unicode(self, text):
80
+ return self.NormaliseText.fix_bad_unicode(text)
81
+
82
+ def to_ascii_unicode(self, text):
83
+ return self.NormaliseText.to_ascii_unicode(text)
84
+
85
+ def replace_html(self, text):
86
+ return self.ProcessContacts.replace_html(text, replace_with=config.REPLACE_WITH_HTML)
87
+
88
+ def replace_urls(self, text):
89
+ return self.ProcessContacts.replace_urls(text, replace_with=config.REPLACE_WITH_URL)
90
+
91
+ def replace_emails(self, text):
92
+ return self.ProcessContacts.replace_emails(text, replace_with=config.REPLACE_WITH_EMAIL)
93
+
94
+ def replace_years(self, text):
95
+ return self.ProcessDateTime.replace_years(text, replace_with=config.REPLACE_WITH_YEARS)
96
+
97
+ def replace_phone_numbers(self, text):
98
+ return self.ProcessContacts.replace_phone_numbers(text, replace_with=config.REPLACE_WITH_PHONE_NUMBERS)
99
+
100
+ def replace_numbers(self, text):
101
+ return self.ProcessContacts.replace_numbers(text, replace_with=config.REPLACE_WITH_NUMBERS)
102
+
103
+ def replace_currency_symbols(self, text):
104
+ return self.ProcessSpecialSymbols.replace_currency_symbols(text, replace_with=config.REPLACE_WITH_CURRENCY_SYMBOLS)
105
+
106
+ def ner_process(self, text):
107
+ return self.GeneralNER.ner_process(text, config.POSITIONAL_TAGS, config.NER_CONFIDENCE_THRESHOLD, self.language)
108
+
109
+ def remove_isolated_letters(self, text):
110
+ return self.ProcessSpecialSymbols.remove_isolated_letters(text)
111
+
112
+ def remove_isolated_special_symbols(self, text):
113
+ return self.ProcessSpecialSymbols.remove_isolated_special_symbols(text)
114
+
115
+ def normalize_whitespace(self, text):
116
+ return self.NormaliseText.normalize_whitespace(text, no_line_breaks=True)
117
+
118
+ def statistical_model_processing(self, text):
119
+ if config.CHECK_CASEFOLD:
120
+ stext = text.casefold() # lowercase
121
+ if config.CHECK_REMOVE_STOPWORDS:
122
+ stext = self.ProcessStopwords.remove_stopwords(stext, self.language)
123
+ if config.CHECK_REMOVE_PUNCTUATION:
124
+ stext = self.ProcessSpecialSymbols.remove_punctuation(stext)
125
+ if config.CHECK_REMOVE_ISOLATED_LETTERS:
126
+ stext = self.ProcessSpecialSymbols.remove_isolated_letters(stext)
127
+ if config.CHECK_NORMALIZE_WHITESPACE:
128
+ stext = self.NormaliseText.normalize_whitespace(stext)
129
+ return stext
@@ -36,6 +36,14 @@ EMAIL_REGEX = re.compile(
36
36
  flags=re.IGNORECASE | re.UNICODE,
37
37
  )
38
38
 
39
+ # for more information: https://github.com/jfilter/clean-text/issues/10
40
+ # PHONE_REGEX = re.compile(
41
+ # r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
42
+ # )
43
+ # PHONE_REGEX = re.compile(
44
+ # r"((?:^|(?<=[^\w)]))((\+?[01]|0{1,2}\d{0,1}|\+\d{2})[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
45
+ # )
46
+
39
47
  PHONE_REGEX = re.compile(
40
48
  r"((?:^|(?<=[^\w)]))((\+?\d+|0{1,2}\d*?)[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
41
49
  )
@@ -108,7 +116,6 @@ strange_double_quotes = [
108
116
  "〟",
109
117
  """,
110
118
  ]
111
-
112
119
  strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]
113
120
 
114
121
  DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
@@ -120,4 +127,4 @@ ISOLATED_LETTERS_REGEX = re.compile(r"(?:^|\s)[B, C, D, E, F, G, H, I, J, K, L,
120
127
 
121
128
  ISOLATED_SPECIAL_SYMBOLS_REGEX = re.compile(r"(?<![a-zA-Z0-9])[:_.|><;·}@~!?+#)({,/\\\\^]+(?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
122
129
 
123
- SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
130
+ SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
@@ -0,0 +1,50 @@
1
+ from sct.utils import constants
2
+ from bs4 import BeautifulSoup
3
+
4
+ class ProcessContacts:
5
+
6
+ def __init__(self):
7
+ pass
8
+
9
+ def replace_urls(self, text, replace_with="<URL>"):
10
+ """
11
+ Replace all URLs in ``text`` str with ``replace_with`` str.
12
+ """
13
+ # matches = constants.URL_REGEX.finditer(text)
14
+ # result = text
15
+ # # Iterate through matches in reverse order (to avoid index issues)
16
+ # for match in reversed(list(matches)):
17
+ # # Check if the matched substring contains non-ASCII characters
18
+ # if not any(ord(char) > 127 for char in match.group()):
19
+ # result = text[:match.start()] + replace_with + text[match.end():]
20
+ return constants.URL_REGEX.sub(replace_with, text)
21
+
22
+ def replace_html(self, text, replace_with="<HTML>"):
23
+ """
24
+ Replace all html tags in ``text`` str with ``replace_with`` str.
25
+ """
26
+ try:
27
+ soup = BeautifulSoup(text, 'html.parser')
28
+ text = soup.get_text()
29
+ except:
30
+ text = constants.HTML_REGEX.sub(replace_with, text)
31
+
32
+ return text
33
+
34
+ def replace_emails(self, text, replace_with="<EMAIL>"):
35
+ """
36
+ Replace all emails in ``text`` str with ``replace_with`` str.
37
+ """
38
+ return constants.EMAIL_REGEX.sub(replace_with, text)
39
+
40
+ def replace_phone_numbers(self, text, replace_with="<PHONE>"):
41
+ """
42
+ Replace all phone numbers in ``text`` str with ``replace_with`` str.
43
+ """
44
+ return constants.PHONE_REGEX.sub(replace_with, text)
45
+
46
+ def replace_numbers(self, text, replace_with="<NUMBER>"):
47
+ """
48
+ Replace all numbers in ``text`` str with ``replace_with`` str.
49
+ """
50
+ return constants.NUMBERS_REGEX.sub(replace_with, text)
@@ -0,0 +1,15 @@
1
+ from sct.utils import constants
2
+
3
+
4
+ class ProcessDateTime:
5
+
6
+ def __init__(self):
7
+ pass
8
+
9
+ def replace_years(self, text, replace_with ="<YEAR>"):
10
+ """
11
+ Replaces years between 1900 and 2099 in the text with a special token.
12
+ """
13
+ cleaned_string = constants.YEAR_REGEX.sub(replace_with, text)
14
+
15
+ return cleaned_string