SqueakyCleanText 0.2.2__tar.gz → 0.2.4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. SqueakyCleanText-0.2.4/PKG-INFO +228 -0
  2. SqueakyCleanText-0.2.4/README.md +204 -0
  3. SqueakyCleanText-0.2.4/SqueakyCleanText.egg-info/PKG-INFO +228 -0
  4. SqueakyCleanText-0.2.4/sct/config.py +100 -0
  5. SqueakyCleanText-0.2.4/sct/sct.py +341 -0
  6. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/utils/constants.py +2 -9
  7. SqueakyCleanText-0.2.4/sct/utils/contact.py +88 -0
  8. SqueakyCleanText-0.2.4/sct/utils/datetime.py +26 -0
  9. SqueakyCleanText-0.2.4/sct/utils/ner.py +366 -0
  10. SqueakyCleanText-0.2.4/sct/utils/normtext.py +148 -0
  11. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/utils/resources.py +1 -0
  12. SqueakyCleanText-0.2.4/sct/utils/special.py +113 -0
  13. SqueakyCleanText-0.2.4/sct/utils/stopwords.py +62 -0
  14. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/setup.py +2 -3
  15. SqueakyCleanText-0.2.2/PKG-INFO +0 -157
  16. SqueakyCleanText-0.2.2/README.md +0 -132
  17. SqueakyCleanText-0.2.2/SqueakyCleanText.egg-info/PKG-INFO +0 -157
  18. SqueakyCleanText-0.2.2/sct/config.py +0 -57
  19. SqueakyCleanText-0.2.2/sct/sct.py +0 -125
  20. SqueakyCleanText-0.2.2/sct/utils/contact.py +0 -50
  21. SqueakyCleanText-0.2.2/sct/utils/datetime.py +0 -15
  22. SqueakyCleanText-0.2.2/sct/utils/ner.py +0 -227
  23. SqueakyCleanText-0.2.2/sct/utils/normtext.py +0 -83
  24. SqueakyCleanText-0.2.2/sct/utils/special.py +0 -50
  25. SqueakyCleanText-0.2.2/sct/utils/stopwords.py +0 -35
  26. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/LICENSE +0 -0
  27. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/MANIFEST.in +0 -0
  28. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
  29. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
  30. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
  31. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/SqueakyCleanText.egg-info/requires.txt +0 -0
  32. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/SqueakyCleanText.egg-info/top_level.txt +0 -0
  33. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/__init__.py +0 -0
  34. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/scripts/__init__.py +0 -0
  35. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/scripts/download_nltk_stopwords.py +0 -0
  36. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/sct/utils/__init__.py +0 -0
  37. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/setup.cfg +0 -0
  38. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/tests/__init__.py +0 -0
  39. {SqueakyCleanText-0.2.2 → SqueakyCleanText-0.2.4}/tests/test_sct.py +0 -0
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.1
2
+ Name: SqueakyCleanText
3
+ Version: 0.2.4
4
+ Summary: A comprehensive text cleaning and preprocessing pipeline.
5
+ Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
+ Author: Rehan Fazal
7
+ License: MIT
8
+ Keywords: text cleaning,text preprocessing,NLP,natural language processing
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.8
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Classifier: Topic :: Text Processing
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ Provides-Extra: dev
22
+ Provides-Extra: test
23
+ License-File: LICENSE
24
+
25
+ # `SqueakyCleanText`
26
+
27
+ [![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)
28
+
29
+ In the world of machine learning and natural language processing, clean and well-structured text data is crucial for building effective downstream models and managing token limits in language models.
30
+
31
+ SqueakyCleanText simplifies the process by automatically addressing common text issues, ensuring your data is clean and well-structured with minimal effort on your part.
32
+
33
+ ### Key Features
34
+ - Encoding Issues: Corrects text encoding problems.
35
+ - HTML and URLs: Removes unnecessary long HTML tags and URLs, or replaces them with special tokens.
36
+ - Contact Information: Strips emails, phone numbers, and other contact details, or replaces them with special tokens.
37
+ - Isolated Characters: Eliminates isolated letters or symbols that add no value.
38
+ - NER Support: Uses a soft voting ensemble technique to handle named entities like location, person, and organization names, which can be replaced with special tokens if not needed in the text.
39
+ - Stopwords and Punctuation: For statistical models, it optimizes text by removing stopwords, special symbols, and punctuation.
40
+ - Currency Symbols: Replaces all currency symbols with their alphabetical equivalents.
41
+ - Whitespace Normalization: Removes unnecessary whitespace.
42
+ - Detects the language of the processed text, useful for downstream tasks.
43
+ - Supports English, Dutch, German, and Spanish languages.
44
+ - Provides text formatted for both Language Model processing and Statistical Model processing.
45
+
46
+ ##### Benefits for Statistical Models
47
+ When working with statistical models, further optimization is often required, such as removing stopwords, special symbols, and punctuation.
48
+ SqueakyCleanText streamlines this process, ensuring your text data is in optimal shape for classification and other downstream tasks.
49
+
50
+ ##### Advantage for Ensemble NER Process
51
+ Relying on a single model for Named Entity Recognition (NER) may not be ideal, as there is a significant chance that it might miss some entities. Combining language-specific NER models increases specificity and reduces the risk of missing entities.
52
+ The NER model in this package includes a chunking mechanism, enabling effective NER processing even when the text exceeds the model's token size limit.
53
+
54
+ By automating these text cleaning steps, SqueakyCleanText ensures your data is prepared efficiently and effectively, saving time and improving model performance.
55
+
56
+ ## Installation
57
+
58
+ To install SqueakyCleanText, use the following pip command:
59
+
60
+ ```sh
61
+ pip install SqueakyCleanText
62
+ ```
63
+
64
+ ## Usage
65
+
66
+ Here are a few examples of how to use the SqueakyCleanText package:
67
+
68
+ Examples:
69
+ ```python
70
+ english_text = "Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, I can't make it to the meeting at 10:00 AM. LOL! Call me at +1-555-123-4567 or email me at john.doe@example.com. Check out this cool website: https://www.example.com."
71
+
72
+ dutch_text = "Hé Jan Jansen, wil je wat koffie halen bij Starbucks op de 5e Avenue? Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om 10:00 uur. LOL! Bel me op +31-6-1234-5678 of mail me op jan.jansen@voorbeeld.com. Kijk eens naar deze coole website: https://www.voorbeeld.com."
73
+ ```
74
+
75
+ - Using default configuration settings:
76
+
77
+ ```python
78
+ # The first time you import the package, it may take some time because it downloads the NER models. Please be patient.
79
+ from sct import sct
80
+
81
+ # Initialize the TextCleaner
82
+ sx = sct.TextCleaner()
83
+
84
+ # Process the text
85
+ # lmtext : Text for Language Models;
86
+ # cmtext : Text for Classical/Statistical ML;
87
+ # language : Processed text language
88
+
89
+ #### --- English Text
90
+ lmtext, cmtext, language = sx.process(english_text)
91
+ print(f"Language Model Text : {lmtext}")
92
+ print(f"Statistical Model Text : {cmtext}")
93
+ print(f"Language of the Text : {language}")
94
+
95
+ # Output the result
96
+ # Language Model Text : Hey <PERSON> wanna grab some coffee at Starbucks on <LOCATION> I'm feeling a bit tired after last night's party at <PERSON>'s place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
97
+ # Statistical Model Text : hey person wanna grab coffee starbucks location im feeling bit tired last nights party persons place btw cant make meeting numbernumber am lol call phone email email check cool website url
98
+ # Language of the Text : ENGLISH
99
+
100
+ #### --- Dutch Text
101
+ lmtext, cmtext, language = sx.process(dutch_text)
102
+ print(f"Language Model Text : {lmtext}")
103
+ print(f"Statistical Model Text : {cmtext}")
104
+ print(f"Language of the Text : {language}")
105
+
106
+ # Output the result
107
+ # Language Model Text : He <PERSON> wil je wat koffie halen bij <ORGANISATION> op de <LOCATION> Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om <NUMBER><NUMBER> uur. LOL! Bel me op <NUMBER><NUMBER><PHONE> of mail me op <EMAIL> Kijk eens naar deze coole website: <URL>
108
+ # Statistical Model Text : he person koffie halen organisation location voel beetje moe feest gisteravond annes huis btw vergadering numbernumber uur lol bel numbernumberphone mail email kijk coole website url
109
+ # Language of the Text : DUTCH
110
+ ```
111
+
112
+ - Using the package with custom configuration:
113
+ You can modify the package’s functionality by changing settings in the configuration file before initializing TextCleaner().
114
+
115
+ - Deactivating NER altogether:
116
+
117
+ ```python
118
+
119
+ from sct import sct, config
120
+
121
+ config.CHECK_NER_PROCESS = False
122
+ sx = sct.TextCleaner()
123
+
124
+ lmtext, cmtext, language = sx.process(english_text)
125
+ print(f"Language Model Text : {lmtext}")
126
+ print(f"Statistical Model Text : {cmtext}")
127
+ print(f"Language of the Text : {language}")
128
+
129
+ # Output the result
130
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
131
+ # Statistical Model Text : hey john doe wanna grab coffee starbucks 5th avenue im feeling bit tired last nights party janes place btw cant make meeting numbernumber am lol call phone email email check cool website url
132
+ # Language of the Text : ENGLISH
133
+ ```
134
+
135
+ - In case the statistical model text is not needed:
136
+
137
+ ```python
138
+
139
+ from sct import sct, config
140
+
141
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = False
142
+ sx = sct.TextCleaner()
143
+
144
+ lmtext, language = sx.process(english_text)
145
+ print(f"Language Model Text : {lmtext}")
146
+ print(f"Language of the Text : {language}")
147
+
148
+ # Output the result
149
+
150
+ # Output the result
151
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
152
+ # Language of the Text : ENGLISH
153
+ ```
154
+ ### Full List of Configurable Settings:
155
+
156
+ Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
+
158
+ ```python
159
+ # CHECK_DETECT_LANGUAGE is honoured as False only when language detection is not required and neither NER nor
160
+ # statistical-model stopword removal is needed.
161
+ CHECK_DETECT_LANGUAGE = True
162
+ CHECK_FIX_BAD_UNICODE = True
163
+ CHECK_TO_ASCII_UNICODE = True
164
+ CHECK_REPLACE_HTML = True
165
+ CHECK_REPLACE_URLS = True
166
+ CHECK_REPLACE_EMAILS = True
167
+ CHECK_REPLACE_YEARS = True
168
+ CHECK_REPLACE_PHONE_NUMBERS = True
169
+ CHECK_REPLACE_NUMBERS = True
170
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
+ CHECK_NER_PROCESS = True
172
+ CHECK_REMOVE_ISOLATED_LETTERS = True
173
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
+ CHECK_NORMALIZE_WHITESPACE = True
175
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
176
+ CHECK_CASEFOLD = True
177
+ CHECK_REMOVE_STOPWORDS = True
178
+ CHECK_REMOVE_PUNCTUATION = True
179
+ CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
+ # Replacement tags can be changed if needed; pass "" if no special tag is wanted
181
+ REPLACE_WITH_URL = "<URL>"
182
+ REPLACE_WITH_HTML = "<HTML>"
183
+ REPLACE_WITH_EMAIL = "<EMAIL>"
184
+ REPLACE_WITH_YEARS = "<YEAR>"
185
+ REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
+ REPLACE_WITH_NUMBERS = "<NUMBER>"
187
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
188
+ # You can remove any of the tags
189
+ POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
+ NER_CONFIDENCE_THRESHOLD = 0.85
191
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
+ LANGUAGE = None
193
+
194
+ # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
195
+ # All models passed need to support transformers AutoModel
196
+ NER_MODELS_LIST = [
197
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
+ "Babelscape/wikineural-multilingual-ner"
202
+ ]
203
+ ```
204
+
205
+ ## API
206
+
207
+ ### `sct.TextCleaner`
208
+
209
+ #### `process(text: str) -> Tuple[str, str, str]`
210
+
211
+ Processes the input text and returns a tuple containing:
212
+ - Cleaned text formatted for language models.
213
+ - Cleaned text formatted for statistical models (stopwords removed); omitted from the tuple when `config.CHECK_STATISTICAL_MODEL_PROCESSING` is `False`.
214
+ - Detected language of the text.
215
+
216
+ ## Contributing
217
+
218
+ Contributions are welcome! Please feel free to submit a Pull Request or open an issue.
219
+
220
+ ## License
221
+
222
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
223
+
224
+ ## Acknowledgements
225
+
226
+ The package took inspiration from the following repository:
227
+
228
+ - [clean-text](https://github.com/jfilter/clean-text)
@@ -0,0 +1,204 @@
1
+ # `SqueakyCleanText`
2
+
3
+ [![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)
4
+
5
+ In the world of machine learning and natural language processing, clean and well-structured text data is crucial for building effective downstream models and managing token limits in language models.
6
+
7
+ SqueakyCleanText simplifies the process by automatically addressing common text issues, ensuring your data is clean and well-structured with minimal effort on your part.
8
+
9
+ ### Key Features
10
+ - Encoding Issues: Corrects text encoding problems.
11
+ - HTML and URLs: Removes unnecessary long HTML tags and URLs, or replaces them with special tokens.
12
+ - Contact Information: Strips emails, phone numbers, and other contact details, or replaces them with special tokens.
13
+ - Isolated Characters: Eliminates isolated letters or symbols that add no value.
14
+ - NER Support: Uses a soft voting ensemble technique to handle named entities like location, person, and organization names, which can be replaced with special tokens if not needed in the text.
15
+ - Stopwords and Punctuation: For statistical models, it optimizes text by removing stopwords, special symbols, and punctuation.
16
+ - Currency Symbols: Replaces all currency symbols with their alphabetical equivalents.
17
+ - Whitespace Normalization: Removes unnecessary whitespace.
18
+ - Detects the language of the processed text, useful for downstream tasks.
19
+ - Supports English, Dutch, German, and Spanish languages.
20
+ - Provides text formatted for both Language Model processing and Statistical Model processing.
21
+
22
+ ##### Benefits for Statistical Models
23
+ When working with statistical models, further optimization is often required, such as removing stopwords, special symbols, and punctuation.
24
+ SqueakyCleanText streamlines this process, ensuring your text data is in optimal shape for classification and other downstream tasks.
25
+
26
+ ##### Advantage for Ensemble NER Process
27
+ Relying on a single model for Named Entity Recognition (NER) may not be ideal, as there is a significant chance that it might miss some entities. Combining language-specific NER models increases specificity and reduces the risk of missing entities.
28
+ The NER model in this package includes a chunking mechanism, enabling effective NER processing even when the text exceeds the model's token size limit.
29
+
30
+ By automating these text cleaning steps, SqueakyCleanText ensures your data is prepared efficiently and effectively, saving time and improving model performance.
31
+
32
+ ## Installation
33
+
34
+ To install SqueakyCleanText, use the following pip command:
35
+
36
+ ```sh
37
+ pip install SqueakyCleanText
38
+ ```
39
+
40
+ ## Usage
41
+
42
+ Here are a few examples of how to use the SqueakyCleanText package:
43
+
44
+ Examples:
45
+ ```python
46
+ english_text = "Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, I can't make it to the meeting at 10:00 AM. LOL! Call me at +1-555-123-4567 or email me at john.doe@example.com. Check out this cool website: https://www.example.com."
47
+
48
+ dutch_text = "Hé Jan Jansen, wil je wat koffie halen bij Starbucks op de 5e Avenue? Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om 10:00 uur. LOL! Bel me op +31-6-1234-5678 of mail me op jan.jansen@voorbeeld.com. Kijk eens naar deze coole website: https://www.voorbeeld.com."
49
+ ```
50
+
51
+ - Using default configuration settings:
52
+
53
+ ```python
54
+ # The first time you import the package, it may take some time because it downloads the NER models. Please be patient.
55
+ from sct import sct
56
+
57
+ # Initialize the TextCleaner
58
+ sx = sct.TextCleaner()
59
+
60
+ # Process the text
61
+ # lmtext : Text for Language Models;
62
+ # cmtext : Text for Classical/Statistical ML;
63
+ # language : Processed text language
64
+
65
+ #### --- English Text
66
+ lmtext, cmtext, language = sx.process(english_text)
67
+ print(f"Language Model Text : {lmtext}")
68
+ print(f"Statistical Model Text : {cmtext}")
69
+ print(f"Language of the Text : {language}")
70
+
71
+ # Output the result
72
+ # Language Model Text : Hey <PERSON> wanna grab some coffee at Starbucks on <LOCATION> I'm feeling a bit tired after last night's party at <PERSON>'s place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
73
+ # Statistical Model Text : hey person wanna grab coffee starbucks location im feeling bit tired last nights party persons place btw cant make meeting numbernumber am lol call phone email email check cool website url
74
+ # Language of the Text : ENGLISH
75
+
76
+ #### --- Dutch Text
77
+ lmtext, cmtext, language = sx.process(dutch_text)
78
+ print(f"Language Model Text : {lmtext}")
79
+ print(f"Statistical Model Text : {cmtext}")
80
+ print(f"Language of the Text : {language}")
81
+
82
+ # Output the result
83
+ # Language Model Text : He <PERSON> wil je wat koffie halen bij <ORGANISATION> op de <LOCATION> Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om <NUMBER><NUMBER> uur. LOL! Bel me op <NUMBER><NUMBER><PHONE> of mail me op <EMAIL> Kijk eens naar deze coole website: <URL>
84
+ # Statistical Model Text : he person koffie halen organisation location voel beetje moe feest gisteravond annes huis btw vergadering numbernumber uur lol bel numbernumberphone mail email kijk coole website url
85
+ # Language of the Text : DUTCH
86
+ ```
87
+
88
+ - Using the package with custom configuration:
89
+ You can modify the package’s functionality by changing settings in the configuration file before initializing TextCleaner().
90
+
91
+ - Deactivating NER altogether:
92
+
93
+ ```python
94
+
95
+ from sct import sct, config
96
+
97
+ config.CHECK_NER_PROCESS = False
98
+ sx = sct.TextCleaner()
99
+
100
+ lmtext, cmtext, language = sx.process(english_text)
101
+ print(f"Language Model Text : {lmtext}")
102
+ print(f"Statistical Model Text : {cmtext}")
103
+ print(f"Language of the Text : {language}")
104
+
105
+ # Output the result
106
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
107
+ # Statistical Model Text : hey john doe wanna grab coffee starbucks 5th avenue im feeling bit tired last nights party janes place btw cant make meeting numbernumber am lol call phone email email check cool website url
108
+ # Language of the Text : ENGLISH
109
+ ```
110
+
111
+ - In case the statistical model text is not needed:
112
+
113
+ ```python
114
+
115
+ from sct import sct, config
116
+
117
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = False
118
+ sx = sct.TextCleaner()
119
+
120
+ lmtext, language = sx.process(english_text)
121
+ print(f"Language Model Text : {lmtext}")
122
+ print(f"Language of the Text : {language}")
123
+
124
+ # Output the result
125
+
126
+ # Output the result
127
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
128
+ # Language of the Text : ENGLISH
129
+ ```
130
+ ### Full List of Configurable Settings:
131
+
132
+ Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
133
+
134
+ ```python
135
+ # CHECK_DETECT_LANGUAGE is honoured as False only when language detection is not required and neither NER nor
136
+ # statistical-model stopword removal is needed.
137
+ CHECK_DETECT_LANGUAGE = True
138
+ CHECK_FIX_BAD_UNICODE = True
139
+ CHECK_TO_ASCII_UNICODE = True
140
+ CHECK_REPLACE_HTML = True
141
+ CHECK_REPLACE_URLS = True
142
+ CHECK_REPLACE_EMAILS = True
143
+ CHECK_REPLACE_YEARS = True
144
+ CHECK_REPLACE_PHONE_NUMBERS = True
145
+ CHECK_REPLACE_NUMBERS = True
146
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
147
+ CHECK_NER_PROCESS = True
148
+ CHECK_REMOVE_ISOLATED_LETTERS = True
149
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
150
+ CHECK_NORMALIZE_WHITESPACE = True
151
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
152
+ CHECK_CASEFOLD = True
153
+ CHECK_REMOVE_STOPWORDS = True
154
+ CHECK_REMOVE_PUNCTUATION = True
155
+ CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
156
+ # Replacement tags can be changed if needed; pass "" if no special tag is wanted
157
+ REPLACE_WITH_URL = "<URL>"
158
+ REPLACE_WITH_HTML = "<HTML>"
159
+ REPLACE_WITH_EMAIL = "<EMAIL>"
160
+ REPLACE_WITH_YEARS = "<YEAR>"
161
+ REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
162
+ REPLACE_WITH_NUMBERS = "<NUMBER>"
163
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
164
+ # You can remove any of the tags
165
+ POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
166
+ NER_CONFIDENCE_THRESHOLD = 0.85
167
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
168
+ LANGUAGE = None
169
+
170
+ # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
171
+ # All models passed need to support transformers AutoModel
172
+ NER_MODELS_LIST = [
173
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
174
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
175
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
176
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
177
+ "Babelscape/wikineural-multilingual-ner"
178
+ ]
179
+ ```
180
+
181
+ ## API
182
+
183
+ ### `sct.TextCleaner`
184
+
185
+ #### `process(text: str) -> Tuple[str, str, str]`
186
+
187
+ Processes the input text and returns a tuple containing:
188
+ - Cleaned text formatted for language models.
189
+ - Cleaned text formatted for statistical models (stopwords removed); omitted from the tuple when `config.CHECK_STATISTICAL_MODEL_PROCESSING` is `False`.
190
+ - Detected language of the text.
191
+
192
+ ## Contributing
193
+
194
+ Contributions are welcome! Please feel free to submit a Pull Request or open an issue.
195
+
196
+ ## License
197
+
198
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
199
+
200
+ ## Acknowledgements
201
+
202
+ The package took inspiration from the following repository:
203
+
204
+ - [clean-text](https://github.com/jfilter/clean-text)
@@ -0,0 +1,228 @@
1
+ Metadata-Version: 2.1
2
+ Name: SqueakyCleanText
3
+ Version: 0.2.4
4
+ Summary: A comprehensive text cleaning and preprocessing pipeline.
5
+ Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
+ Author: Rehan Fazal
7
+ License: MIT
8
+ Keywords: text cleaning,text preprocessing,NLP,natural language processing
9
+ Classifier: Programming Language :: Python :: 3
10
+ Classifier: Programming Language :: Python :: 3.8
11
+ Classifier: Programming Language :: Python :: 3.9
12
+ Classifier: Programming Language :: Python :: 3.10
13
+ Classifier: Programming Language :: Python :: 3.11
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: License :: OSI Approved :: MIT License
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Topic :: Software Development :: Libraries
18
+ Classifier: Topic :: Text Processing
19
+ Requires-Python: >=3.8
20
+ Description-Content-Type: text/markdown
21
+ Provides-Extra: dev
22
+ Provides-Extra: test
23
+ License-File: LICENSE
24
+
25
+ # `SqueakyCleanText`
26
+
27
+ [![PyPI](https://img.shields.io/pypi/v/squeakycleantext.svg)](https://pypi.org/project/squeakycleantext/) [![PyPI - Downloads](https://img.shields.io/pypi/dm/squeakycleantext)](https://pypistats.org/packages/squeakycleantext)
28
+
29
+ In the world of machine learning and natural language processing, clean and well-structured text data is crucial for building effective downstream models and managing token limits in language models.
30
+
31
+ SqueakyCleanText simplifies the process by automatically addressing common text issues, ensuring your data is clean and well-structured with minimal effort on your part.
32
+
33
+ ### Key Features
34
+ - Encoding Issues: Corrects text encoding problems.
35
+ - HTML and URLs: Removes unnecessary long HTML tags and URLs, or replaces them with special tokens.
36
+ - Contact Information: Strips emails, phone numbers, and other contact details, or replaces them with special tokens.
37
+ - Isolated Characters: Eliminates isolated letters or symbols that add no value.
38
+ - NER Support: Uses a soft voting ensemble technique to handle named entities like location, person, and organization names, which can be replaced with special tokens if not needed in the text.
39
+ - Stopwords and Punctuation: For statistical models, it optimizes text by removing stopwords, special symbols, and punctuation.
40
+ - Currency Symbols: Replaces all currency symbols with their alphabetical equivalents.
41
+ - Whitespace Normalization: Removes unnecessary whitespace.
42
+ - Detects the language of the processed text, useful for downstream tasks.
43
+ - Supports English, Dutch, German, and Spanish languages.
44
+ - Provides text formatted for both Language Model processing and Statistical Model processing.
45
+
46
+ ##### Benefits for Statistical Models
47
+ When working with statistical models, further optimization is often required, such as removing stopwords, special symbols, and punctuation.
48
+ SqueakyCleanText streamlines this process, ensuring your text data is in optimal shape for classification and other downstream tasks.
49
+
50
+ ##### Advantage for Ensemble NER Process
51
+ Relying on a single model for Named Entity Recognition (NER) may not be ideal, as there is a significant chance that it might miss some entities. Combining language-specific NER models increases specificity and reduces the risk of missing entities.
52
+ The NER model in this package includes a chunking mechanism, enabling effective NER processing even when the text exceeds the model's token size limit.
53
+
54
+ By automating these text cleaning steps, SqueakyCleanText ensures your data is prepared efficiently and effectively, saving time and improving model performance.
55
+
56
+ ## Installation
57
+
58
+ To install SqueakyCleanText, use the following pip command:
59
+
60
+ ```sh
61
+ pip install SqueakyCleanText
62
+ ```
63
+
64
+ ## Usage
65
+
66
+ Here are a few examples of how to use the SqueakyCleanText package:
67
+
68
+ Examples:
69
+ ```python
70
+ english_text = "Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, I can't make it to the meeting at 10:00 AM. LOL! Call me at +1-555-123-4567 or email me at john.doe@example.com. Check out this cool website: https://www.example.com."
71
+
72
+ dutch_text = "Hé Jan Jansen, wil je wat koffie halen bij Starbucks op de 5e Avenue? Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om 10:00 uur. LOL! Bel me op +31-6-1234-5678 of mail me op jan.jansen@voorbeeld.com. Kijk eens naar deze coole website: https://www.voorbeeld.com."
73
+ ```
74
+
75
+ - Using default configuration settings:
76
+
77
+ ```python
78
+ # The first time you import the package, it may take some time because it will download the NER models. Please be patient.
79
+ from sct import sct
80
+
81
+ # Initialize the TextCleaner
82
+ sx = sct.TextCleaner()
83
+
84
+ # Process the text
85
+ # lmtext : Text for Language Models;
86
+ # cmtext : Text for Classical/Statistical ML;
87
+ # language : Processed text language
88
+
89
+ #### --- English Text
90
+ lmtext, cmtext, language = sx.process(english_text)
91
+ print(f"Language Model Text : {lmtext}")
92
+ print(f"Statistical Model Text : {cmtext}")
93
+ print(f"Language of the Text : {language}")
94
+
95
+ # Output the result
96
+ # Language Model Text : Hey <PERSON> wanna grab some coffee at Starbucks on <LOCATION> I'm feeling a bit tired after last night's party at <PERSON>'s place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
97
+ # Statistical Model Text : hey person wanna grab coffee starbucks location im feeling bit tired last nights party persons place btw cant make meeting numbernumber am lol call phone email email check cool website url
98
+ # Language of the Text : ENGLISH
99
+
100
+ #### --- Dutch Text
101
+ lmtext, cmtext, language = sx.process(dutch_text)
102
+ print(f"Language Model Text : {lmtext}")
103
+ print(f"Statistical Model Text : {cmtext}")
104
+ print(f"Language of the Text : {language}")
105
+
106
+ # Output the result
107
+ # Language Model Text : He <PERSON> wil je wat koffie halen bij <ORGANISATION> op de <LOCATION> Ik voel me een beetje moe na het feest van gisteravond bij Annes huis. Btw, ik kan niet naar de vergadering om <NUMBER><NUMBER> uur. LOL! Bel me op <NUMBER><NUMBER><PHONE> of mail me op <EMAIL> Kijk eens naar deze coole website: <URL>
108
+ # Statistical Model Text : he person koffie halen organisation location voel beetje moe feest gisteravond annes huis btw vergadering numbernumber uur lol bel numbernumberphone mail email kijk coole website url
109
+ # Language of the Text : DUTCH
110
+ ```
111
+
112
+ - Using the package with custom configuration:
113
+ You can modify the package’s functionality by changing settings in the configuration file before initializing TextCleaner().
114
+
115
+ - Deactivating NER altogether:
116
+
117
+ ```python
118
+
119
+ from sct import sct, config
120
+
121
+ config.CHECK_NER_PROCESS = False
122
+ sx = sct.TextCleaner()
123
+
124
+ lmtext, cmtext, language = sx.process(english_text)
125
+ print(f"Language Model Text : {lmtext}")
126
+ print(f"Statistical Model Text : {cmtext}")
127
+ print(f"Language of the Text : {language}")
128
+
129
+ # Output the result
130
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
131
+ # Statistical Model Text : hey john doe wanna grab coffee starbucks 5th avenue im feeling bit tired last nights party janes place btw cant make meeting numbernumber am lol call phone email email check cool website url
132
+ # Language of the Text : ENGLISH
133
+ ```
134
+
135
+ - In case the statistical model text is not needed:
136
+
137
+ ```python
138
+
139
+ from sct import sct, config
140
+
141
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = False
142
+ sx = sct.TextCleaner()
143
+
144
+ lmtext, language = sx.process(english_text)
145
+ print(f"Language Model Text : {lmtext}")
146
+ print(f"Language of the Text : {language}")
147
+
148
+ # Output the result
149
+
150
+ # Output the result
151
+ # Language Model Text : Hey John Doe, wanna grab some coffee at Starbucks on 5th Avenue? I'm feeling a bit tired after last night's party at Jane's place. BTW, can't make it to the meeting at <NUMBER><NUMBER> AM. LOL! Call me at <PHONE> or email me at <EMAIL> Check out this cool website: <URL>
152
+ # Language of the Text : ENGLISH
153
+ ```
154
+ ### Full List of Configurable Settings:
155
+
156
+ Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
+
158
+ ```python
159
+ # CHECK_DETECT_LANGUAGE = False only takes effect when language detection is not required at all,
159
+ # i.e. when NER and statistical-model stopword removal are disabled as well.
161
+ CHECK_DETECT_LANGUAGE = True
162
+ CHECK_FIX_BAD_UNICODE = True
163
+ CHECK_TO_ASCII_UNICODE = True
164
+ CHECK_REPLACE_HTML = True
165
+ CHECK_REPLACE_URLS = True
166
+ CHECK_REPLACE_EMAILS = True
167
+ CHECK_REPLACE_YEARS = True
168
+ CHECK_REPLACE_PHONE_NUMBERS = True
169
+ CHECK_REPLACE_NUMBERS = True
170
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
+ CHECK_NER_PROCESS = True
172
+ CHECK_REMOVE_ISOLATED_LETTERS = True
173
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
+ CHECK_NORMALIZE_WHITESPACE = True
175
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
176
+ CHECK_CASEFOLD = True
177
+ CHECK_REMOVE_STOPWORDS = True
178
+ CHECK_REMOVE_PUNCTUATION = True
179
+ CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
+ # Replacement tags can be customized; pass "" if no special tag is needed.
181
+ REPLACE_WITH_URL = "<URL>"
182
+ REPLACE_WITH_HTML = "<HTML>"
183
+ REPLACE_WITH_EMAIL = "<EMAIL>"
184
+ REPLACE_WITH_YEARS = "<YEAR>"
185
+ REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
+ REPLACE_WITH_NUMBERS = "<NUMBER>"
187
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
188
+ # You can remove any of the tags
189
+ POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
+ NER_CONFIDENCE_THRESHOLD = 0.85
191
+ # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
+ LANGUAGE = None
193
+
194
+ # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, Multilingual Model
195
+ # All models passed need to support transformers AutoModel
196
+ NER_MODELS_LIST = [
197
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
+ "Babelscape/wikineural-multilingual-ner"
202
+ ]
203
+ ```
204
+
205
+ ## API
206
+
207
+ ### `sct.TextCleaner`
208
+
209
+ #### `process(text: str) -> Tuple[str, str, str]`
210
+
211
+ Processes the input text and returns a tuple containing:
212
+ - Cleaned text formatted for language models.
213
+ - Cleaned text formatted for statistical models (stopwords removed); omitted from the tuple when `config.CHECK_STATISTICAL_MODEL_PROCESSING` is `False`.
214
+ - Detected language of the text.
215
+
216
+ ## Contributing
217
+
218
+ Contributions are welcome! Please feel free to submit a Pull Request or open an issue.
219
+
220
+ ## License
221
+
222
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
223
+
224
+ ## Acknowledgements
225
+
226
+ The package took inspiration from the following repo:
227
+
228
+ - [clean-text](https://github.com/jfilter/clean-text)