PyPI - SqueakyCleanText - Versions diffs - 0.2.5__tar.gz → 0.2.6__tar.gz - Mend

SqueakyCleanText 0.2.5__tar.gz → 0.2.6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.6}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SqueakyCleanText
-Version: 0.2.5
+Version: 0.2.6
 Summary: A comprehensive text cleaning and preprocessing pipeline.
 Home-page: https://github.com/rhnfzl/SqueakyCleanText
 Author: Rehan Fazal
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
-        from sct import sct, config
-        # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
-        # then only CHECK_DETECT_LANGUAGE will be considered False.
-        config.CHECK_DETECT_LANGUAGE = True
-        config.CHECK_FIX_BAD_UNICODE = True
-        config.CHECK_TO_ASCII_UNICODE = True
-        config.CHECK_REPLACE_HTML = True
-        config.CHECK_REPLACE_URLS = True
-        config.CHECK_REPLACE_EMAILS = True
-        config.CHECK_REPLACE_YEARS = True
-        config.CHECK_REPLACE_PHONE_NUMBERS = True
-        config.CHECK_REPLACE_NUMBERS = True
-        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        config.CHECK_NER_PROCESS = True
-        config.CHECK_REMOVE_ISOLATED_LETTERS = True
-        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        config.CHECK_NORMALIZE_WHITESPACE = True
-        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
-        config.CHECK_CASEFOLD = True
-        config.CHECK_REMOVE_STOPWORDS = True
-        config.CHECK_REMOVE_PUNCTUATION = True
-        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
-        # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        config.REPLACE_WITH_URL = "<URL>"
-        config.REPLACE_WITH_HTML = "<HTML>"
-        config.REPLACE_WITH_EMAIL = "<EMAIL>"
-        config.REPLACE_WITH_YEARS = "<YEAR>"
-        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
-        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
-        # You can remove any of the tags
-        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        config.NER_CONFIDENCE_THRESHOLD = 0.85
-        # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        config.LANGUAGE = None
-        # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
-        # All models passed need to support transformers AutoModel
-        config.NER_MODELS_LIST = [
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
-            "Babelscape/wikineural-multilingual-ner"
-        ]
-        sx = sct.TextCleaner()
+    from sct import sct, config
+    # Unless LANGUAGE is set, language detection runs whenever any of CHECK_DETECT_LANGUAGE, CHECK_NER_PROCESS or
+    # CHECK_REMOVE_STOPWORDS is True, so it is skipped only when all three are set to False.
+    config.CHECK_DETECT_LANGUAGE = True
+    config.CHECK_FIX_BAD_UNICODE = True
+    config.CHECK_TO_ASCII_UNICODE = True
+    config.CHECK_REPLACE_HTML = True
+    config.CHECK_REPLACE_URLS = True
+    config.CHECK_REPLACE_EMAILS = True
+    config.CHECK_REPLACE_YEARS = True
+    config.CHECK_REPLACE_PHONE_NUMBERS = True
+    config.CHECK_REPLACE_NUMBERS = True
+    config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+    config.CHECK_NER_PROCESS = True
+    config.CHECK_REMOVE_ISOLATED_LETTERS = True
+    config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+    config.CHECK_NORMALIZE_WHITESPACE = True
+    config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+    config.CHECK_CASEFOLD = True
+    config.CHECK_REMOVE_STOPWORDS = True
+    config.CHECK_REMOVE_PUNCTUATION = True
+    config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+    # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
+    config.REPLACE_WITH_URL = "<URL>"
+    config.REPLACE_WITH_HTML = "<HTML>"
+    config.REPLACE_WITH_EMAIL = "<EMAIL>"
+    config.REPLACE_WITH_YEARS = "<YEAR>"
+    config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+    config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+    config.REPLACE_WITH_CURRENCY_SYMBOLS = None
+    # You can remove any of the tags
+    config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+    config.NER_CONFIDENCE_THRESHOLD = 0.85
+    # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
+    config.LANGUAGE = None
+    # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
+    # All models passed need to support transformers AutoModel
+    config.NER_MODELS_LIST = [
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+        "Babelscape/wikineural-multilingual-ner"
+    ]
+    sx = sct.TextCleaner()
     ```
 ## API
 ### `sct.TextCleaner`

{SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/README.md RENAMED Viewed

@@ -132,55 +132,56 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
-        from sct import sct, config
-        # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
-        # then only CHECK_DETECT_LANGUAGE will be considered False.
-        config.CHECK_DETECT_LANGUAGE = True
-        config.CHECK_FIX_BAD_UNICODE = True
-        config.CHECK_TO_ASCII_UNICODE = True
-        config.CHECK_REPLACE_HTML = True
-        config.CHECK_REPLACE_URLS = True
-        config.CHECK_REPLACE_EMAILS = True
-        config.CHECK_REPLACE_YEARS = True
-        config.CHECK_REPLACE_PHONE_NUMBERS = True
-        config.CHECK_REPLACE_NUMBERS = True
-        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        config.CHECK_NER_PROCESS = True
-        config.CHECK_REMOVE_ISOLATED_LETTERS = True
-        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        config.CHECK_NORMALIZE_WHITESPACE = True
-        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
-        config.CHECK_CASEFOLD = True
-        config.CHECK_REMOVE_STOPWORDS = True
-        config.CHECK_REMOVE_PUNCTUATION = True
-        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
-        # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        config.REPLACE_WITH_URL = "<URL>"
-        config.REPLACE_WITH_HTML = "<HTML>"
-        config.REPLACE_WITH_EMAIL = "<EMAIL>"
-        config.REPLACE_WITH_YEARS = "<YEAR>"
-        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
-        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
-        # You can remove any of the tags
-        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        config.NER_CONFIDENCE_THRESHOLD = 0.85
-        # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        config.LANGUAGE = None
-        # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
-        # All models passed need to support transformers AutoModel
-        config.NER_MODELS_LIST = [
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
-            "Babelscape/wikineural-multilingual-ner"
-        ]
-        sx = sct.TextCleaner()
+    from sct import sct, config
+    # Unless LANGUAGE is set, language detection runs whenever any of CHECK_DETECT_LANGUAGE, CHECK_NER_PROCESS or
+    # CHECK_REMOVE_STOPWORDS is True, so it is skipped only when all three are set to False.
+    config.CHECK_DETECT_LANGUAGE = True
+    config.CHECK_FIX_BAD_UNICODE = True
+    config.CHECK_TO_ASCII_UNICODE = True
+    config.CHECK_REPLACE_HTML = True
+    config.CHECK_REPLACE_URLS = True
+    config.CHECK_REPLACE_EMAILS = True
+    config.CHECK_REPLACE_YEARS = True
+    config.CHECK_REPLACE_PHONE_NUMBERS = True
+    config.CHECK_REPLACE_NUMBERS = True
+    config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+    config.CHECK_NER_PROCESS = True
+    config.CHECK_REMOVE_ISOLATED_LETTERS = True
+    config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+    config.CHECK_NORMALIZE_WHITESPACE = True
+    config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+    config.CHECK_CASEFOLD = True
+    config.CHECK_REMOVE_STOPWORDS = True
+    config.CHECK_REMOVE_PUNCTUATION = True
+    config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+    # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
+    config.REPLACE_WITH_URL = "<URL>"
+    config.REPLACE_WITH_HTML = "<HTML>"
+    config.REPLACE_WITH_EMAIL = "<EMAIL>"
+    config.REPLACE_WITH_YEARS = "<YEAR>"
+    config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+    config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+    config.REPLACE_WITH_CURRENCY_SYMBOLS = None
+    # You can remove any of the tags
+    config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+    config.NER_CONFIDENCE_THRESHOLD = 0.85
+    # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
+    config.LANGUAGE = None
+    # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
+    # All models passed need to support transformers AutoModel
+    config.NER_MODELS_LIST = [
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+        "Babelscape/wikineural-multilingual-ner"
+    ]
+    sx = sct.TextCleaner()
     ```
 ## API
 ### `sct.TextCleaner`
@@ -204,4 +205,4 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file
 The package took inspirations from the following repo:
-- [clean-text](https://github.com/jfilter/clean-text)
+- [clean-text](https://github.com/jfilter/clean-text)

{SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6/SqueakyCleanText.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SqueakyCleanText
-Version: 0.2.5
+Version: 0.2.6
 Summary: A comprehensive text cleaning and preprocessing pipeline.
 Home-page: https://github.com/rhnfzl/SqueakyCleanText
 Author: Rehan Fazal
@@ -156,55 +156,56 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
-        from sct import sct, config
-        # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
-        # then only CHECK_DETECT_LANGUAGE will be considered False.
-        config.CHECK_DETECT_LANGUAGE = True
-        config.CHECK_FIX_BAD_UNICODE = True
-        config.CHECK_TO_ASCII_UNICODE = True
-        config.CHECK_REPLACE_HTML = True
-        config.CHECK_REPLACE_URLS = True
-        config.CHECK_REPLACE_EMAILS = True
-        config.CHECK_REPLACE_YEARS = True
-        config.CHECK_REPLACE_PHONE_NUMBERS = True
-        config.CHECK_REPLACE_NUMBERS = True
-        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        config.CHECK_NER_PROCESS = True
-        config.CHECK_REMOVE_ISOLATED_LETTERS = True
-        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        config.CHECK_NORMALIZE_WHITESPACE = True
-        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
-        config.CHECK_CASEFOLD = True
-        config.CHECK_REMOVE_STOPWORDS = True
-        config.CHECK_REMOVE_PUNCTUATION = True
-        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
-        # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        config.REPLACE_WITH_URL = "<URL>"
-        config.REPLACE_WITH_HTML = "<HTML>"
-        config.REPLACE_WITH_EMAIL = "<EMAIL>"
-        config.REPLACE_WITH_YEARS = "<YEAR>"
-        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
-        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
-        # You can remove any of the tags
-        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        config.NER_CONFIDENCE_THRESHOLD = 0.85
-        # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        config.LANGUAGE = None
-        # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
-        # All models passed need to support transformers AutoModel
-        config.NER_MODELS_LIST = [
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
-            "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
-            "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
-            "Babelscape/wikineural-multilingual-ner"
-        ]
-        sx = sct.TextCleaner()
+    from sct import sct, config
+    # Unless LANGUAGE is set, language detection runs whenever any of CHECK_DETECT_LANGUAGE, CHECK_NER_PROCESS or
+    # CHECK_REMOVE_STOPWORDS is True, so it is skipped only when all three are set to False.
+    config.CHECK_DETECT_LANGUAGE = True
+    config.CHECK_FIX_BAD_UNICODE = True
+    config.CHECK_TO_ASCII_UNICODE = True
+    config.CHECK_REPLACE_HTML = True
+    config.CHECK_REPLACE_URLS = True
+    config.CHECK_REPLACE_EMAILS = True
+    config.CHECK_REPLACE_YEARS = True
+    config.CHECK_REPLACE_PHONE_NUMBERS = True
+    config.CHECK_REPLACE_NUMBERS = True
+    config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+    config.CHECK_NER_PROCESS = True
+    config.CHECK_REMOVE_ISOLATED_LETTERS = True
+    config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+    config.CHECK_NORMALIZE_WHITESPACE = True
+    config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+    config.CHECK_CASEFOLD = True
+    config.CHECK_REMOVE_STOPWORDS = True
+    config.CHECK_REMOVE_PUNCTUATION = True
+    config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+    # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
+    config.REPLACE_WITH_URL = "<URL>"
+    config.REPLACE_WITH_HTML = "<HTML>"
+    config.REPLACE_WITH_EMAIL = "<EMAIL>"
+    config.REPLACE_WITH_YEARS = "<YEAR>"
+    config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+    config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+    config.REPLACE_WITH_CURRENCY_SYMBOLS = None
+    # You can remove any of the tags
+    config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+    config.NER_CONFIDENCE_THRESHOLD = 0.85
+    # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
+    config.LANGUAGE = None
+    # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
+    # All models passed need to support transformers AutoModel
+    config.NER_MODELS_LIST = [
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+        "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
+        "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+        "Babelscape/wikineural-multilingual-ner"
+    ]
+    sx = sct.TextCleaner()
     ```
 ## API
 ### `sct.TextCleaner`

SqueakyCleanText-0.2.6/sct/config.py ADDED Viewed

@@ -0,0 +1,57 @@
+"""
+    detect_language : to detect the language automatically, but would consume more time if done on a batch
+    fix_bad_unicode : if True, fix "broken" unicode such as mojibake and garbled HTML entities
+    to_ascii_unicode : if True, convert non-ASCII characters into their closest ASCII equivalents
+    replace_with_url : special URL token, default "<URL>",
+    replace_with_email : special EMAIL token, default "<EMAIL>",
+    replace_years : special YEAR token, default "<YEAR>",
+    replace_with_phone_number : special PHONE token, default "<PHONE>",
+    replace_with_number : special NUMBER token, default "<NUMBER>",
+    no_currency_symbols : if True, replace all currency symbols with the respective alphabetical ones,
+    ner_process : run the NER process to remove the positional tags, e.g. PER, LOC, ORG, MISC
+    remove_isolated_letters : remove any isolated letters that don't add any value to the text
+    remove_isolated_symbols : remove any isolated symbols that shouldn't be present in the text, usually those that
+                            aren't immediately preceded and followed by a letter or number
+    normalize_whitespace : remove any unnecessary whitespace
+    statistical_model_processing : to get the statistical model text, like for fastText, SVM, LR etc
+    casefold : to lower the text
+    remove_stopwords : remove stopwords based on the language, uses NLTK stopwords
+    remove_punctuation : removes all the special symbols
+"""
+CHECK_DETECT_LANGUAGE = True
+CHECK_FIX_BAD_UNICODE = True
+CHECK_TO_ASCII_UNICODE = True
+CHECK_REPLACE_HTML = True
+CHECK_REPLACE_URLS = True
+CHECK_REPLACE_EMAILS = True
+CHECK_REPLACE_YEARS = True
+CHECK_REPLACE_PHONE_NUMBERS = True
+CHECK_REPLACE_NUMBERS = True
+CHECK_REPLACE_CURRENCY_SYMBOLS = True
+CHECK_NER_PROCESS = True
+CHECK_REMOVE_ISOLATED_LETTERS = True
+CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+CHECK_NORMALIZE_WHITESPACE = True
+CHECK_STATISTICAL_MODEL_PROCESSING = True
+CHECK_CASEFOLD = True
+CHECK_REMOVE_STOPWORDS = True
+CHECK_REMOVE_PUNCTUATION = True
+CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
+REPLACE_WITH_URL = "<URL>"
+REPLACE_WITH_HTML = "<HTML>"
+REPLACE_WITH_EMAIL = "<EMAIL>"
+REPLACE_WITH_YEARS = "<YEAR>"
+REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+REPLACE_WITH_NUMBERS = "<NUMBER>"
+REPLACE_WITH_CURRENCY_SYMBOLS = None
+POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+NER_CONFIDENCE_THRESHOLD = 0.85
+LANGUAGE = None
+# Order of the model is Important : English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
+NER_MODELS_LIST = ["FacebookAI/xlm-roberta-large-finetuned-conll03-english",
+              "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
+              "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
+              "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
+              "Babelscape/wikineural-multilingual-ner"]

SqueakyCleanText-0.2.6/sct/sct.py ADDED Viewed

@@ -0,0 +1,129 @@
+"""
+This code provides a comprehensive text cleaning and preprocessing pipeline.
+It includes functions to normalize, remove personal information and clean text data,
+which is crucial for natural language processing tasks.
+"""
+from sct import config
+from sct.utils import contact, datetime, ner, normtext, resources, special, stopwords
+class TextCleaner:
+    def __init__(self):
+        self.ProcessContacts = contact.ProcessContacts()
+        self.ProcessDateTime = datetime.ProcessDateTime()
+        self.ProcessSpecialSymbols = special.ProcessSpecialSymbols()
+        self.NormaliseText = normtext.NormaliseText()
+        self.ProcessStopwords = stopwords.ProcessStopwords()
+        self.GeneralNER = ner.GeneralNER()
+        self.pipeline = []
+        self.language = None
+        self.init_pipeline()
+    def init_pipeline(self):
+        # Initialize pipeline steps based on config
+        language_config = config.LANGUAGE.lower() if config.LANGUAGE else None
+        if language_config and language_config in resources.LANGUAGE_NAME:
+            self.language = language_config.upper()
+        elif any([config.CHECK_DETECT_LANGUAGE, config.CHECK_NER_PROCESS, config.CHECK_REMOVE_STOPWORDS]):
+            self.pipeline.append(self.detect_language)
+        if config.CHECK_FIX_BAD_UNICODE:
+            self.pipeline.append(self.fix_bad_unicode)
+        if config.CHECK_TO_ASCII_UNICODE:
+            self.pipeline.append(self.to_ascii_unicode)
+        if config.CHECK_REPLACE_HTML:
+            self.pipeline.append(self.replace_html)
+        if config.CHECK_REPLACE_URLS:
+            self.pipeline.append(self.replace_urls)
+        if config.CHECK_REPLACE_EMAILS:
+            self.pipeline.append(self.replace_emails)
+        if config.CHECK_REPLACE_YEARS:
+            self.pipeline.append(self.replace_years)
+        if config.CHECK_REPLACE_PHONE_NUMBERS:
+            self.pipeline.append(self.replace_phone_numbers)
+        if config.CHECK_REPLACE_NUMBERS:
+            self.pipeline.append(self.replace_numbers)
+        if config.CHECK_REPLACE_CURRENCY_SYMBOLS:
+            self.pipeline.append(self.replace_currency_symbols)
+        if config.CHECK_NER_PROCESS:
+            self.pipeline.append(self.ner_process)
+        if config.CHECK_REMOVE_ISOLATED_LETTERS:
+            self.pipeline.append(self.remove_isolated_letters)
+        if config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS:
+            self.pipeline.append(self.remove_isolated_special_symbols)
+        if config.CHECK_NORMALIZE_WHITESPACE:
+            self.pipeline.append(self.normalize_whitespace)
+    def process(self, text):
+        text = str(text)
+        for step in self.pipeline:
+            text = step(text)
+        if config.CHECK_STATISTICAL_MODEL_PROCESSING:
+            stext = self.statistical_model_processing(text)
+            return text, stext, self.language
+        elif config.CHECK_DETECT_LANGUAGE:
+            return text, self.language
+        else:
+            return text
+    def detect_language(self, text):
+        self.language = str(resources.DETECTOR.detect_language_of(text)).split(".")[-1]
+        return text
+    def fix_bad_unicode(self, text):
+        return self.NormaliseText.fix_bad_unicode(text)
+    def to_ascii_unicode(self, text):
+        return self.NormaliseText.to_ascii_unicode(text)
+    def replace_html(self, text):
+        return self.ProcessContacts.replace_html(text, replace_with=config.REPLACE_WITH_HTML)
+    def replace_urls(self, text):
+        return self.ProcessContacts.replace_urls(text, replace_with=config.REPLACE_WITH_URL)
+    def replace_emails(self, text):
+        return self.ProcessContacts.replace_emails(text, replace_with=config.REPLACE_WITH_EMAIL)
+    def replace_years(self, text):
+        return self.ProcessDateTime.replace_years(text, replace_with=config.REPLACE_WITH_YEARS)
+    def replace_phone_numbers(self, text):
+        return self.ProcessContacts.replace_phone_numbers(text, replace_with=config.REPLACE_WITH_PHONE_NUMBERS)
+    def replace_numbers(self, text):
+        return self.ProcessContacts.replace_numbers(text, replace_with=config.REPLACE_WITH_NUMBERS)
+    def replace_currency_symbols(self, text):
+        return self.ProcessSpecialSymbols.replace_currency_symbols(text, replace_with=config.REPLACE_WITH_CURRENCY_SYMBOLS)
+    def ner_process(self, text):
+        return self.GeneralNER.ner_process(text, config.POSITIONAL_TAGS, config.NER_CONFIDENCE_THRESHOLD, self.language)
+    def remove_isolated_letters(self, text):
+        return self.ProcessSpecialSymbols.remove_isolated_letters(text)
+    def remove_isolated_special_symbols(self, text):
+        return self.ProcessSpecialSymbols.remove_isolated_special_symbols(text)
+    def normalize_whitespace(self, text):
+        return self.NormaliseText.normalize_whitespace(text, no_line_breaks=True)
+    def statistical_model_processing(self, text):
+        if config.CHECK_CASEFOLD:
+            stext = text.casefold()  # lowercase
+        if config.CHECK_REMOVE_STOPWORDS:
+            stext = self.ProcessStopwords.remove_stopwords(stext, self.language)
+        if config.CHECK_REMOVE_PUNCTUATION:
+            stext = self.ProcessSpecialSymbols.remove_punctuation(stext)
+        if config.CHECK_REMOVE_ISOLATED_LETTERS:
+            stext = self.ProcessSpecialSymbols.remove_isolated_letters(stext)
+        if config.CHECK_NORMALIZE_WHITESPACE:
+            stext = self.NormaliseText.normalize_whitespace(stext)
+        return stext

{SqueakyCleanText-0.2.5 → SqueakyCleanText-0.2.6}/sct/utils/constants.py RENAMED Viewed

@@ -36,6 +36,14 @@ EMAIL_REGEX = re.compile(
     flags=re.IGNORECASE | re.UNICODE,
 )
+# for more information: https://github.com/jfilter/clean-text/issues/10
+# PHONE_REGEX = re.compile(
+#     r"((?:^|(?<=[^\w)]))(((\+?[01])|(\+\d{2}))[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
+# )
+# PHONE_REGEX = re.compile(
+#     r"((?:^|(?<=[^\w)]))((\+?[01]|0{1,2}\d{0,1}|\+\d{2})[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
+# )
 PHONE_REGEX = re.compile(
     r"((?:^|(?<=[^\w)]))((\+?\d+|0{1,2}\d*?)[ .-]?)?(\(?\d{3,4}\)?/?[ .-]?)?(\d{3}[ .-]?\d{4})(\s?(?:ext\.?|[#x-])\s?\d{2,6})?(?:$|(?=\W)))|\+?\d{4,5}[ .-/]\d{6,9}"
 )
@@ -108,7 +116,6 @@ strange_double_quotes = [
     "〟",
     "＂",
 ]
 strange_single_quotes = ["‘", "‛", "’", "❛", "❜", "`", "´", "‘", "’"]
 DOUBLE_QUOTE_REGEX = re.compile("|".join(strange_double_quotes))
@@ -120,4 +127,4 @@ ISOLATED_LETTERS_REGEX = re.compile(r"(?:^|\s)[B, C, D, E, F, G, H, I, J, K, L,
 ISOLATED_SPECIAL_SYMBOLS_REGEX = re.compile(r"(?<![a-zA-Z0-9])[:_.|><;·}@~!?+#)({,/\\\\^]+(?![a-zA-Z0-9])", flags=re.UNICODE | re.IGNORECASE)
-SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')
+SENTENCE_BOUNDARY_PATTERN = re.compile('(?<=[.!?])\s+(?=[^\d])')

SqueakyCleanText-0.2.6/sct/utils/contact.py ADDED Viewed

@@ -0,0 +1,50 @@
+from sct.utils import constants
+from bs4 import BeautifulSoup
+class ProcessContacts:
+    def __init__(self):
+        pass
+    def replace_urls(self, text, replace_with="<URL>"):
+        """
+        Replace all URLs in ``text`` str with ``replace_with`` str.
+        """
+        # matches = constants.URL_REGEX.finditer(text)
+        # result = text
+        # # Iterate through matches in reverse order (to avoid index issues)
+        # for match in reversed(list(matches)):
+        # # Check if the matched substring contains non-ASCII characters
+        #     if not any(ord(char) > 127 for char in match.group()):
+        #         result = text[:match.start()] + replace_with + text[match.end():]
+        return constants.URL_REGEX.sub(replace_with, text)
+    def replace_html(self, text, replace_with="<HTML>"):
+        """
+        Replace all html tags in ``text`` str with ``replace_with`` str.
+        """
+        try:
+            soup = BeautifulSoup(text, 'html.parser')
+            text = soup.get_text()
+        except:
+            text = constants.HTML_REGEX.sub(replace_with, text)
+        return text
+    def replace_emails(self, text, replace_with="<EMAIL>"):
+        """
+        Replace all emails in ``text`` str with ``replace_with`` str.
+        """
+        return constants.EMAIL_REGEX.sub(replace_with, text)
+    def replace_phone_numbers(self, text, replace_with="<PHONE>"):
+        """
+        Replace all phone numbers in ``text`` str with ``replace_with`` str.
+        """
+        return constants.PHONE_REGEX.sub(replace_with, text)
+    def replace_numbers(self, text, replace_with="<NUMBER>"):
+        """
+        Replace all numbers in ``text`` str with ``replace_with`` str.
+        """
+        return constants.NUMBERS_REGEX.sub(replace_with, text)

SqueakyCleanText-0.2.6/sct/utils/datetime.py ADDED Viewed

@@ -0,0 +1,15 @@
+from sct.utils import constants
+class ProcessDateTime:
+    def __init__(self):
+        pass
+    def replace_years(self, text, replace_with ="<YEAR>"):
+        """
+        Replaces years between 1900 and 2099 in the text with a special token.
+        """
+        cleaned_string = constants.YEAR_REGEX.sub(replace_with, text)
+        return cleaned_string

SqueakyCleanText 0.2.5__tar.gz → 0.2.6__tar.gz

SqueakyCleanText 0.2.5__tar.gz → 0.2.6__tar.gz