PyPI - SqueakyCleanText - Versions diffs - 0.2.3__tar.gz → 0.2.5__tar.gz - Mend

SqueakyCleanText 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

{SqueakyCleanText-0.2.3/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SqueakyCleanText
-Version: 0.2.3
+Version: 0.2.5
 Summary: A comprehensive text cleaning and preprocessing pipeline.
 Home-page: https://github.com/rhnfzl/SqueakyCleanText
 Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
+        from sct import sct, config
         # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
         # then only CHECK_DETECT_LANGUAGE will be considered False.
-        CHECK_DETECT_LANGUAGE = True
-        CHECK_FIX_BAD_UNICODE = True
-        CHECK_TO_ASCII_UNICODE = True
-        CHECK_REPLACE_HTML = True
-        CHECK_REPLACE_URLS = True
-        CHECK_REPLACE_EMAILS = True
-        CHECK_REPLACE_YEARS = True
-        CHECK_REPLACE_PHONE_NUMBERS = True
-        CHECK_REPLACE_NUMBERS = True
-        CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        CHECK_NER_PROCESS = True
-        CHECK_REMOVE_ISOLATED_LETTERS = True
-        CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        CHECK_NORMALIZE_WHITESPACE = True
-        CHECK_STATISTICAL_MODEL_PROCESSING = True
-        CHECK_CASEFOLD = True
-        CHECK_REMOVE_STOPWORDS = True
-        CHECK_REMOVE_PUNCTUATION = True
-        CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+        config.CHECK_DETECT_LANGUAGE = True
+        config.CHECK_FIX_BAD_UNICODE = True
+        config.CHECK_TO_ASCII_UNICODE = True
+        config.CHECK_REPLACE_HTML = True
+        config.CHECK_REPLACE_URLS = True
+        config.CHECK_REPLACE_EMAILS = True
+        config.CHECK_REPLACE_YEARS = True
+        config.CHECK_REPLACE_PHONE_NUMBERS = True
+        config.CHECK_REPLACE_NUMBERS = True
+        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+        config.CHECK_NER_PROCESS = True
+        config.CHECK_REMOVE_ISOLATED_LETTERS = True
+        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+        config.CHECK_NORMALIZE_WHITESPACE = True
+        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+        config.CHECK_CASEFOLD = True
+        config.CHECK_REMOVE_STOPWORDS = True
+        config.CHECK_REMOVE_PUNCTUATION = True
+        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
         # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        REPLACE_WITH_URL = "<URL>"
-        REPLACE_WITH_HTML = "<HTML>"
-        REPLACE_WITH_EMAIL = "<EMAIL>"
-        REPLACE_WITH_YEARS = "<YEAR>"
-        REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        REPLACE_WITH_NUMBERS = "<NUMBER>"
-        REPLACE_WITH_CURRENCY_SYMBOLS = None
+        config.REPLACE_WITH_URL = "<URL>"
+        config.REPLACE_WITH_HTML = "<HTML>"
+        config.REPLACE_WITH_EMAIL = "<EMAIL>"
+        config.REPLACE_WITH_YEARS = "<YEAR>"
+        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
         # You can remove any of the tags
-        POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        NER_CONFIDENCE_THRESHOLD = 0.85
+        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+        config.NER_CONFIDENCE_THRESHOLD = 0.85
         # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        LANGUAGE = None
+        config.LANGUAGE = None
         # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
         # All models passed need to support transformers AutoModel
-        NER_MODELS_LIST = [
+        config.NER_MODELS_LIST = [
             "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
             "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
             "Babelscape/wikineural-multilingual-ner"
         ]
+        sx = sct.TextCleaner()
     ```
 ## API

{SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/README.md RENAMED Viewed

@@ -132,50 +132,53 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
+        from sct import sct, config
         # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
         # then only CHECK_DETECT_LANGUAGE will be considered False.
-        CHECK_DETECT_LANGUAGE = True
-        CHECK_FIX_BAD_UNICODE = True
-        CHECK_TO_ASCII_UNICODE = True
-        CHECK_REPLACE_HTML = True
-        CHECK_REPLACE_URLS = True
-        CHECK_REPLACE_EMAILS = True
-        CHECK_REPLACE_YEARS = True
-        CHECK_REPLACE_PHONE_NUMBERS = True
-        CHECK_REPLACE_NUMBERS = True
-        CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        CHECK_NER_PROCESS = True
-        CHECK_REMOVE_ISOLATED_LETTERS = True
-        CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        CHECK_NORMALIZE_WHITESPACE = True
-        CHECK_STATISTICAL_MODEL_PROCESSING = True
-        CHECK_CASEFOLD = True
-        CHECK_REMOVE_STOPWORDS = True
-        CHECK_REMOVE_PUNCTUATION = True
-        CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+        config.CHECK_DETECT_LANGUAGE = True
+        config.CHECK_FIX_BAD_UNICODE = True
+        config.CHECK_TO_ASCII_UNICODE = True
+        config.CHECK_REPLACE_HTML = True
+        config.CHECK_REPLACE_URLS = True
+        config.CHECK_REPLACE_EMAILS = True
+        config.CHECK_REPLACE_YEARS = True
+        config.CHECK_REPLACE_PHONE_NUMBERS = True
+        config.CHECK_REPLACE_NUMBERS = True
+        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+        config.CHECK_NER_PROCESS = True
+        config.CHECK_REMOVE_ISOLATED_LETTERS = True
+        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+        config.CHECK_NORMALIZE_WHITESPACE = True
+        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+        config.CHECK_CASEFOLD = True
+        config.CHECK_REMOVE_STOPWORDS = True
+        config.CHECK_REMOVE_PUNCTUATION = True
+        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
         # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        REPLACE_WITH_URL = "<URL>"
-        REPLACE_WITH_HTML = "<HTML>"
-        REPLACE_WITH_EMAIL = "<EMAIL>"
-        REPLACE_WITH_YEARS = "<YEAR>"
-        REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        REPLACE_WITH_NUMBERS = "<NUMBER>"
-        REPLACE_WITH_CURRENCY_SYMBOLS = None
+        config.REPLACE_WITH_URL = "<URL>"
+        config.REPLACE_WITH_HTML = "<HTML>"
+        config.REPLACE_WITH_EMAIL = "<EMAIL>"
+        config.REPLACE_WITH_YEARS = "<YEAR>"
+        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
         # You can remove any of the tags
-        POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        NER_CONFIDENCE_THRESHOLD = 0.85
+        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+        config.NER_CONFIDENCE_THRESHOLD = 0.85
         # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        LANGUAGE = None
+        config.LANGUAGE = None
         # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
         # All models passed need to support transformers AutoModel
-        NER_MODELS_LIST = [
+        config.NER_MODELS_LIST = [
             "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
             "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
             "Babelscape/wikineural-multilingual-ner"
         ]
+        sx = sct.TextCleaner()
     ```
 ## API

{SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: SqueakyCleanText
-Version: 0.2.3
+Version: 0.2.5
 Summary: A comprehensive text cleaning and preprocessing pipeline.
 Home-page: https://github.com/rhnfzl/SqueakyCleanText
 Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
     Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
     ```python
+        from sct import sct, config
         # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
         # then only CHECK_DETECT_LANGUAGE will be considered False.
-        CHECK_DETECT_LANGUAGE = True
-        CHECK_FIX_BAD_UNICODE = True
-        CHECK_TO_ASCII_UNICODE = True
-        CHECK_REPLACE_HTML = True
-        CHECK_REPLACE_URLS = True
-        CHECK_REPLACE_EMAILS = True
-        CHECK_REPLACE_YEARS = True
-        CHECK_REPLACE_PHONE_NUMBERS = True
-        CHECK_REPLACE_NUMBERS = True
-        CHECK_REPLACE_CURRENCY_SYMBOLS = True
-        CHECK_NER_PROCESS = True
-        CHECK_REMOVE_ISOLATED_LETTERS = True
-        CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
-        CHECK_NORMALIZE_WHITESPACE = True
-        CHECK_STATISTICAL_MODEL_PROCESSING = True
-        CHECK_CASEFOLD = True
-        CHECK_REMOVE_STOPWORDS = True
-        CHECK_REMOVE_PUNCTUATION = True
-        CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
+        config.CHECK_DETECT_LANGUAGE = True
+        config.CHECK_FIX_BAD_UNICODE = True
+        config.CHECK_TO_ASCII_UNICODE = True
+        config.CHECK_REPLACE_HTML = True
+        config.CHECK_REPLACE_URLS = True
+        config.CHECK_REPLACE_EMAILS = True
+        config.CHECK_REPLACE_YEARS = True
+        config.CHECK_REPLACE_PHONE_NUMBERS = True
+        config.CHECK_REPLACE_NUMBERS = True
+        config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
+        config.CHECK_NER_PROCESS = True
+        config.CHECK_REMOVE_ISOLATED_LETTERS = True
+        config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+        config.CHECK_NORMALIZE_WHITESPACE = True
+        config.CHECK_STATISTICAL_MODEL_PROCESSING = True
+        config.CHECK_CASEFOLD = True
+        config.CHECK_REMOVE_STOPWORDS = True
+        config.CHECK_REMOVE_PUNCTUATION = True
+        config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
         # Tags can be replaced if needed, like if no special tags are necessary "" can be passed
-        REPLACE_WITH_URL = "<URL>"
-        REPLACE_WITH_HTML = "<HTML>"
-        REPLACE_WITH_EMAIL = "<EMAIL>"
-        REPLACE_WITH_YEARS = "<YEAR>"
-        REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
-        REPLACE_WITH_NUMBERS = "<NUMBER>"
-        REPLACE_WITH_CURRENCY_SYMBOLS = None
+        config.REPLACE_WITH_URL = "<URL>"
+        config.REPLACE_WITH_HTML = "<HTML>"
+        config.REPLACE_WITH_EMAIL = "<EMAIL>"
+        config.REPLACE_WITH_YEARS = "<YEAR>"
+        config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+        config.REPLACE_WITH_NUMBERS = "<NUMBER>"
+        config.REPLACE_WITH_CURRENCY_SYMBOLS = None
         # You can remove any of the tags
-        POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
-        NER_CONFIDENCE_THRESHOLD = 0.85
+        config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+        config.NER_CONFIDENCE_THRESHOLD = 0.85
         # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
-        LANGUAGE = None
+        config.LANGUAGE = None
         # Order of the model is Important: English Model, Dutch Model, German Model, Spanish Model, MULTILINGUAL Model
         # All models passed need to support transformers AutoModel
-        NER_MODELS_LIST = [
+        config.NER_MODELS_LIST = [
             "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
             "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
             "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
             "Babelscape/wikineural-multilingual-ner"
         ]
+        sx = sct.TextCleaner()
     ```
 ## API

SqueakyCleanText-0.2.5/sct/config.py ADDED Viewed

@@ -0,0 +1,100 @@
+"""
+Module containing the configuration parameters for the SCT package.
+"""
+# Flag to detect the language automatically. If True, the language will be detected for each text.
+CHECK_DETECT_LANGUAGE = True
+# Flag to fix "broken" unicode such as mojibake and garbled HTML entities.
+CHECK_FIX_BAD_UNICODE = True
+# Flag to convert non-ASCII characters into their closest ASCII equivalents.
+CHECK_TO_ASCII_UNICODE = True
+# Flag to replace HTML tags with a special token.
+CHECK_REPLACE_HTML = True
+# Flag to replace URLs with a special token.
+CHECK_REPLACE_URLS = True
+# Flag to replace email addresses with a special token.
+CHECK_REPLACE_EMAILS = True
+# Flag to replace years with a special token.
+CHECK_REPLACE_YEARS = True
+# Flag to replace phone numbers with a special token.
+CHECK_REPLACE_PHONE_NUMBERS = True
+# Flag to replace numbers with a special token.
+CHECK_REPLACE_NUMBERS = True
+# Flag to replace currency symbols with their respective alphabetical equivalents.
+CHECK_REPLACE_CURRENCY_SYMBOLS = True
+# Flag to execute Named Entity Recognition (NER) to remove positional tags such as PER, LOC, ORG, MISC.
+CHECK_NER_PROCESS = True
+# Flag to remove any isolated letters which do not add any value to the text.
+CHECK_REMOVE_ISOLATED_LETTERS = True
+# Flag to remove any isolated symbols which should not be present in the text.
+CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
+# Flag to remove any unnecessary whitespace.
+CHECK_NORMALIZE_WHITESPACE = True
+# Flag to get the statistical model text, such as for fastText, SVM, LR.
+CHECK_STATISTICAL_MODEL_PROCESSING = True
+# Flag to convert all characters to lowercase.
+CHECK_CASEFOLD = True
+# Flag to remove stopwords based on the language. Uses NLTK stopwords.
+CHECK_REMOVE_STOPWORDS = True
+# Flag to remove all special symbols.
+CHECK_REMOVE_PUNCTUATION = True
+# Flag to remove custom stopwords specific to the SCT package.
+CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
+# Special token to replace URLs.
+REPLACE_WITH_URL = "<URL>"
+# Special token to replace HTML tags.
+REPLACE_WITH_HTML = "<HTML>"
+# Special token to replace email addresses.
+REPLACE_WITH_EMAIL = "<EMAIL>"
+# Special token to replace years.
+REPLACE_WITH_YEARS = "<YEAR>"
+# Special token to replace phone numbers.
+REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
+# Special token to replace numbers.
+REPLACE_WITH_NUMBERS = "<NUMBER>"
+# Special token to replace currency symbols. If None, symbols will be replaced with their 3-letter abbreviations.
+REPLACE_WITH_CURRENCY_SYMBOLS = None
+# List of positional tags to be removed by NER.
+POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
+# Confidence threshold for NER.
+NER_CONFIDENCE_THRESHOLD = 0.85
+# Language to be used for NER. If None, the language will be detected automatically.
+LANGUAGE = None
+# List of pre-trained NER models in order of importance.
+NER_MODELS_LIST = [
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-english",  # English Model
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",  # Dutch Model
+    "FacebookAI/xlm-roberta-large-finetuned-conll03-german",  # German Model
+    "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",  # Spanish Model
+    "Babelscape/wikineural-multilingual-ner"  # Multilingual Model
+]

SqueakyCleanText 0.2.3__tar.gz → 0.2.5__tar.gz

SqueakyCleanText 0.2.3__tar.gz → 0.2.5__tar.gz