SqueakyCleanText 0.2.4__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {SqueakyCleanText-0.2.4/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.5}/PKG-INFO +34 -31
  2. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/README.md +33 -30
  3. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info}/PKG-INFO +34 -31
  4. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/ner.py +1 -0
  5. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/setup.py +1 -1
  6. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/LICENSE +0 -0
  7. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/MANIFEST.in +0 -0
  8. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
  9. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
  10. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
  11. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/requires.txt +0 -0
  12. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/top_level.txt +0 -0
  13. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/__init__.py +0 -0
  14. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/config.py +0 -0
  15. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/scripts/__init__.py +0 -0
  16. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/scripts/download_nltk_stopwords.py +0 -0
  17. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/sct.py +0 -0
  18. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/__init__.py +0 -0
  19. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/constants.py +0 -0
  20. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/contact.py +0 -0
  21. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/datetime.py +0 -0
  22. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/normtext.py +0 -0
  23. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/resources.py +0 -0
  24. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/special.py +0 -0
  25. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/sct/utils/stopwords.py +0 -0
  26. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/setup.cfg +0 -0
  27. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/tests/__init__.py +0 -0
  28. {SqueakyCleanText-0.2.4 → SqueakyCleanText-0.2.5}/tests/test_sct.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
+ from sct import sct, config
159
160
  # If language detection is not required, and neither NER nor statistical-model stopword removal is needed,
160
161
  # only then will CHECK_DETECT_LANGUAGE be treated as False.
161
- CHECK_DETECT_LANGUAGE = True
162
- CHECK_FIX_BAD_UNICODE = True
163
- CHECK_TO_ASCII_UNICODE = True
164
- CHECK_REPLACE_HTML = True
165
- CHECK_REPLACE_URLS = True
166
- CHECK_REPLACE_EMAILS = True
167
- CHECK_REPLACE_YEARS = True
168
- CHECK_REPLACE_PHONE_NUMBERS = True
169
- CHECK_REPLACE_NUMBERS = True
170
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
- CHECK_NER_PROCESS = True
172
- CHECK_REMOVE_ISOLATED_LETTERS = True
173
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
- CHECK_NORMALIZE_WHITESPACE = True
175
- CHECK_STATISTICAL_MODEL_PROCESSING = True
176
- CHECK_CASEFOLD = True
177
- CHECK_REMOVE_STOPWORDS = True
178
- CHECK_REMOVE_PUNCTUATION = True
179
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
181
  # Replacement tags can be customized; pass "" if no special tag is needed.
181
- REPLACE_WITH_URL = "<URL>"
182
- REPLACE_WITH_HTML = "<HTML>"
183
- REPLACE_WITH_EMAIL = "<EMAIL>"
184
- REPLACE_WITH_YEARS = "<YEAR>"
185
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
- REPLACE_WITH_NUMBERS = "<NUMBER>"
187
- REPLACE_WITH_CURRENCY_SYMBOLS = None
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
188
189
  # You can remove any of the tags
189
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
- NER_CONFIDENCE_THRESHOLD = 0.85
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
191
192
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
- LANGUAGE = None
193
+ config.LANGUAGE = None
193
194
 
194
195
  # The order of the models is important: English model, Dutch model, German model, Spanish model, multilingual model
195
196
  # All models passed need to support transformers AutoModel
196
- NER_MODELS_LIST = [
197
+ config.NER_MODELS_LIST = [
197
198
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
199
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
200
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
201
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
202
  "Babelscape/wikineural-multilingual-ner"
202
203
  ]
204
+
205
+ sx = sct.TextCleaner()
203
206
  ```
204
207
 
205
208
  ## API
@@ -132,50 +132,53 @@ You can modify the package’s functionality by changing settings in the configu
132
132
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
133
133
 
134
134
  ```python
135
+ from sct import sct, config
135
136
  # If language detection is not required, and neither NER nor statistical-model stopword removal is needed,
136
137
  # only then will CHECK_DETECT_LANGUAGE be treated as False.
137
- CHECK_DETECT_LANGUAGE = True
138
- CHECK_FIX_BAD_UNICODE = True
139
- CHECK_TO_ASCII_UNICODE = True
140
- CHECK_REPLACE_HTML = True
141
- CHECK_REPLACE_URLS = True
142
- CHECK_REPLACE_EMAILS = True
143
- CHECK_REPLACE_YEARS = True
144
- CHECK_REPLACE_PHONE_NUMBERS = True
145
- CHECK_REPLACE_NUMBERS = True
146
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
147
- CHECK_NER_PROCESS = True
148
- CHECK_REMOVE_ISOLATED_LETTERS = True
149
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
150
- CHECK_NORMALIZE_WHITESPACE = True
151
- CHECK_STATISTICAL_MODEL_PROCESSING = True
152
- CHECK_CASEFOLD = True
153
- CHECK_REMOVE_STOPWORDS = True
154
- CHECK_REMOVE_PUNCTUATION = True
155
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
138
+ config.CHECK_DETECT_LANGUAGE = True
139
+ config.CHECK_FIX_BAD_UNICODE = True
140
+ config.CHECK_TO_ASCII_UNICODE = True
141
+ config.CHECK_REPLACE_HTML = True
142
+ config.CHECK_REPLACE_URLS = True
143
+ config.CHECK_REPLACE_EMAILS = True
144
+ config.CHECK_REPLACE_YEARS = True
145
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
146
+ config.CHECK_REPLACE_NUMBERS = True
147
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
148
+ config.CHECK_NER_PROCESS = True
149
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
150
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
151
+ config.CHECK_NORMALIZE_WHITESPACE = True
152
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
153
+ config.CHECK_CASEFOLD = True
154
+ config.CHECK_REMOVE_STOPWORDS = True
155
+ config.CHECK_REMOVE_PUNCTUATION = True
156
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
156
157
  # Replacement tags can be customized; pass "" if no special tag is needed.
157
- REPLACE_WITH_URL = "<URL>"
158
- REPLACE_WITH_HTML = "<HTML>"
159
- REPLACE_WITH_EMAIL = "<EMAIL>"
160
- REPLACE_WITH_YEARS = "<YEAR>"
161
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
162
- REPLACE_WITH_NUMBERS = "<NUMBER>"
163
- REPLACE_WITH_CURRENCY_SYMBOLS = None
158
+ config.REPLACE_WITH_URL = "<URL>"
159
+ config.REPLACE_WITH_HTML = "<HTML>"
160
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
161
+ config.REPLACE_WITH_YEARS = "<YEAR>"
162
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
163
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
164
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
164
165
  # You can remove any of the tags
165
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
166
- NER_CONFIDENCE_THRESHOLD = 0.85
166
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
167
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
167
168
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
168
- LANGUAGE = None
169
+ config.LANGUAGE = None
169
170
 
170
171
  # The order of the models is important: English model, Dutch model, German model, Spanish model, multilingual model
171
172
  # All models passed need to support transformers AutoModel
172
- NER_MODELS_LIST = [
173
+ config.NER_MODELS_LIST = [
173
174
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
174
175
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
175
176
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
176
177
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
177
178
  "Babelscape/wikineural-multilingual-ner"
178
179
  ]
180
+
181
+ sx = sct.TextCleaner()
179
182
  ```
180
183
 
181
184
  ## API
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.4
3
+ Version: 0.2.5
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
+ from sct import sct, config
159
160
  # If language detection is not required, and neither NER nor statistical-model stopword removal is needed,
160
161
  # only then will CHECK_DETECT_LANGUAGE be treated as False.
161
- CHECK_DETECT_LANGUAGE = True
162
- CHECK_FIX_BAD_UNICODE = True
163
- CHECK_TO_ASCII_UNICODE = True
164
- CHECK_REPLACE_HTML = True
165
- CHECK_REPLACE_URLS = True
166
- CHECK_REPLACE_EMAILS = True
167
- CHECK_REPLACE_YEARS = True
168
- CHECK_REPLACE_PHONE_NUMBERS = True
169
- CHECK_REPLACE_NUMBERS = True
170
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
- CHECK_NER_PROCESS = True
172
- CHECK_REMOVE_ISOLATED_LETTERS = True
173
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
- CHECK_NORMALIZE_WHITESPACE = True
175
- CHECK_STATISTICAL_MODEL_PROCESSING = True
176
- CHECK_CASEFOLD = True
177
- CHECK_REMOVE_STOPWORDS = True
178
- CHECK_REMOVE_PUNCTUATION = True
179
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
181
  # Replacement tags can be customized; pass "" if no special tag is needed.
181
- REPLACE_WITH_URL = "<URL>"
182
- REPLACE_WITH_HTML = "<HTML>"
183
- REPLACE_WITH_EMAIL = "<EMAIL>"
184
- REPLACE_WITH_YEARS = "<YEAR>"
185
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
- REPLACE_WITH_NUMBERS = "<NUMBER>"
187
- REPLACE_WITH_CURRENCY_SYMBOLS = None
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
188
189
  # You can remove any of the tags
189
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
- NER_CONFIDENCE_THRESHOLD = 0.85
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
191
192
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
- LANGUAGE = None
193
+ config.LANGUAGE = None
193
194
 
194
195
  # The order of the models is important: English model, Dutch model, German model, Spanish model, multilingual model
195
196
  # All models passed need to support transformers AutoModel
196
- NER_MODELS_LIST = [
197
+ config.NER_MODELS_LIST = [
197
198
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
199
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
200
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
201
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
202
  "Babelscape/wikineural-multilingual-ner"
202
203
  ]
204
+
205
+ sx = sct.TextCleaner()
203
206
  ```
204
207
 
205
208
  ## API
@@ -11,6 +11,7 @@ from presidio_anonymizer.entities import RecognizerResult
11
11
 
12
12
  from sct.utils import constants
13
13
  from sct import config
14
+ from typing import List, Dict, Any # Add this line
14
15
  transformers.logging.set_verbosity_error()
15
16
 
16
17
  class GeneralNER:
@@ -2,7 +2,7 @@ from setuptools import setup, find_packages
2
2
 
3
3
  setup(
4
4
  name='SqueakyCleanText',
5
- version='0.2.4',
5
+ version='0.2.5',
6
6
  author='Rehan Fazal',
7
7
  description='A comprehensive text cleaning and preprocessing pipeline.',
8
8
  long_description=open('README.md', encoding='utf-8').read(),