SqueakyCleanText 0.2.3__tar.gz → 0.2.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36) hide show
  1. {SqueakyCleanText-0.2.3/SqueakyCleanText.egg-info → SqueakyCleanText-0.2.5}/PKG-INFO +34 -31
  2. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/README.md +33 -30
  3. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5/SqueakyCleanText.egg-info}/PKG-INFO +34 -31
  4. SqueakyCleanText-0.2.5/sct/config.py +100 -0
  5. SqueakyCleanText-0.2.5/sct/sct.py +341 -0
  6. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/constants.py +2 -9
  7. SqueakyCleanText-0.2.5/sct/utils/contact.py +88 -0
  8. SqueakyCleanText-0.2.5/sct/utils/datetime.py +26 -0
  9. SqueakyCleanText-0.2.5/sct/utils/ner.py +367 -0
  10. SqueakyCleanText-0.2.5/sct/utils/normtext.py +148 -0
  11. SqueakyCleanText-0.2.5/sct/utils/special.py +113 -0
  12. SqueakyCleanText-0.2.5/sct/utils/stopwords.py +62 -0
  13. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/setup.py +1 -1
  14. SqueakyCleanText-0.2.3/sct/config.py +0 -57
  15. SqueakyCleanText-0.2.3/sct/sct.py +0 -129
  16. SqueakyCleanText-0.2.3/sct/utils/contact.py +0 -50
  17. SqueakyCleanText-0.2.3/sct/utils/datetime.py +0 -15
  18. SqueakyCleanText-0.2.3/sct/utils/ner.py +0 -227
  19. SqueakyCleanText-0.2.3/sct/utils/normtext.py +0 -83
  20. SqueakyCleanText-0.2.3/sct/utils/special.py +0 -50
  21. SqueakyCleanText-0.2.3/sct/utils/stopwords.py +0 -35
  22. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/LICENSE +0 -0
  23. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/MANIFEST.in +0 -0
  24. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/SOURCES.txt +0 -0
  25. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/dependency_links.txt +0 -0
  26. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/entry_points.txt +0 -0
  27. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/requires.txt +0 -0
  28. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/SqueakyCleanText.egg-info/top_level.txt +0 -0
  29. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/__init__.py +0 -0
  30. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/scripts/__init__.py +0 -0
  31. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/scripts/download_nltk_stopwords.py +0 -0
  32. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/__init__.py +0 -0
  33. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/sct/utils/resources.py +0 -0
  34. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/setup.cfg +0 -0
  35. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/tests/__init__.py +0 -0
  36. {SqueakyCleanText-0.2.3 → SqueakyCleanText-0.2.5}/tests/test_sct.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
+ from sct import sct, config
159
160
  # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
160
161
  # then only CHECK_DETECT_LANGUAGE will be considered False.
161
- CHECK_DETECT_LANGUAGE = True
162
- CHECK_FIX_BAD_UNICODE = True
163
- CHECK_TO_ASCII_UNICODE = True
164
- CHECK_REPLACE_HTML = True
165
- CHECK_REPLACE_URLS = True
166
- CHECK_REPLACE_EMAILS = True
167
- CHECK_REPLACE_YEARS = True
168
- CHECK_REPLACE_PHONE_NUMBERS = True
169
- CHECK_REPLACE_NUMBERS = True
170
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
- CHECK_NER_PROCESS = True
172
- CHECK_REMOVE_ISOLATED_LETTERS = True
173
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
- CHECK_NORMALIZE_WHITESPACE = True
175
- CHECK_STATISTICAL_MODEL_PROCESSING = True
176
- CHECK_CASEFOLD = True
177
- CHECK_REMOVE_STOPWORDS = True
178
- CHECK_REMOVE_PUNCTUATION = True
179
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
181
  # Replacement tags can be customized; pass "" if no special tag is wanted
181
- REPLACE_WITH_URL = "<URL>"
182
- REPLACE_WITH_HTML = "<HTML>"
183
- REPLACE_WITH_EMAIL = "<EMAIL>"
184
- REPLACE_WITH_YEARS = "<YEAR>"
185
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
- REPLACE_WITH_NUMBERS = "<NUMBER>"
187
- REPLACE_WITH_CURRENCY_SYMBOLS = None
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
188
189
  # You can remove any of the tags
189
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
- NER_CONFIDENCE_THRESHOLD = 0.85
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
191
192
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
- LANGUAGE = None
193
+ config.LANGUAGE = None
193
194
 
194
195
  # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, Multilingual Model
195
196
  # All models passed need to support transformers AutoModel
196
- NER_MODELS_LIST = [
197
+ config.NER_MODELS_LIST = [
197
198
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
199
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
200
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
201
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
202
  "Babelscape/wikineural-multilingual-ner"
202
203
  ]
204
+
205
+ sx = sct.TextCleaner()
203
206
  ```
204
207
 
205
208
  ## API
@@ -132,50 +132,53 @@ You can modify the package’s functionality by changing settings in the configu
132
132
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
133
133
 
134
134
  ```python
135
+ from sct import sct, config
135
136
  # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
136
137
  # then only CHECK_DETECT_LANGUAGE will be considered False.
137
- CHECK_DETECT_LANGUAGE = True
138
- CHECK_FIX_BAD_UNICODE = True
139
- CHECK_TO_ASCII_UNICODE = True
140
- CHECK_REPLACE_HTML = True
141
- CHECK_REPLACE_URLS = True
142
- CHECK_REPLACE_EMAILS = True
143
- CHECK_REPLACE_YEARS = True
144
- CHECK_REPLACE_PHONE_NUMBERS = True
145
- CHECK_REPLACE_NUMBERS = True
146
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
147
- CHECK_NER_PROCESS = True
148
- CHECK_REMOVE_ISOLATED_LETTERS = True
149
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
150
- CHECK_NORMALIZE_WHITESPACE = True
151
- CHECK_STATISTICAL_MODEL_PROCESSING = True
152
- CHECK_CASEFOLD = True
153
- CHECK_REMOVE_STOPWORDS = True
154
- CHECK_REMOVE_PUNCTUATION = True
155
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
138
+ config.CHECK_DETECT_LANGUAGE = True
139
+ config.CHECK_FIX_BAD_UNICODE = True
140
+ config.CHECK_TO_ASCII_UNICODE = True
141
+ config.CHECK_REPLACE_HTML = True
142
+ config.CHECK_REPLACE_URLS = True
143
+ config.CHECK_REPLACE_EMAILS = True
144
+ config.CHECK_REPLACE_YEARS = True
145
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
146
+ config.CHECK_REPLACE_NUMBERS = True
147
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
148
+ config.CHECK_NER_PROCESS = True
149
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
150
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
151
+ config.CHECK_NORMALIZE_WHITESPACE = True
152
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
153
+ config.CHECK_CASEFOLD = True
154
+ config.CHECK_REMOVE_STOPWORDS = True
155
+ config.CHECK_REMOVE_PUNCTUATION = True
156
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
156
157
  # Replacement tags can be customized; pass "" if no special tag is wanted
157
- REPLACE_WITH_URL = "<URL>"
158
- REPLACE_WITH_HTML = "<HTML>"
159
- REPLACE_WITH_EMAIL = "<EMAIL>"
160
- REPLACE_WITH_YEARS = "<YEAR>"
161
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
162
- REPLACE_WITH_NUMBERS = "<NUMBER>"
163
- REPLACE_WITH_CURRENCY_SYMBOLS = None
158
+ config.REPLACE_WITH_URL = "<URL>"
159
+ config.REPLACE_WITH_HTML = "<HTML>"
160
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
161
+ config.REPLACE_WITH_YEARS = "<YEAR>"
162
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
163
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
164
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
164
165
  # You can remove any of the tags
165
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
166
- NER_CONFIDENCE_THRESHOLD = 0.85
166
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
167
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
167
168
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
168
- LANGUAGE = None
169
+ config.LANGUAGE = None
169
170
 
170
171
  # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, Multilingual Model
171
172
  # All models passed need to support transformers AutoModel
172
- NER_MODELS_LIST = [
173
+ config.NER_MODELS_LIST = [
173
174
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
174
175
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
175
176
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
176
177
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
177
178
  "Babelscape/wikineural-multilingual-ner"
178
179
  ]
180
+
181
+ sx = sct.TextCleaner()
179
182
  ```
180
183
 
181
184
  ## API
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: SqueakyCleanText
3
- Version: 0.2.3
3
+ Version: 0.2.5
4
4
  Summary: A comprehensive text cleaning and preprocessing pipeline.
5
5
  Home-page: https://github.com/rhnfzl/SqueakyCleanText
6
6
  Author: Rehan Fazal
@@ -156,50 +156,53 @@ You can modify the package’s functionality by changing settings in the configu
156
156
  Similarly, other aspects of the configuration can be changed. Simply modify the settings before initializing TextCleaner(). Below is the full list of configurable settings:
157
157
 
158
158
  ```python
159
+ from sct import sct, config
159
160
  # In case Language detection is not required as well as No NER and No Statistical Model stopwords are needed,
160
161
  # then only CHECK_DETECT_LANGUAGE will be considered False.
161
- CHECK_DETECT_LANGUAGE = True
162
- CHECK_FIX_BAD_UNICODE = True
163
- CHECK_TO_ASCII_UNICODE = True
164
- CHECK_REPLACE_HTML = True
165
- CHECK_REPLACE_URLS = True
166
- CHECK_REPLACE_EMAILS = True
167
- CHECK_REPLACE_YEARS = True
168
- CHECK_REPLACE_PHONE_NUMBERS = True
169
- CHECK_REPLACE_NUMBERS = True
170
- CHECK_REPLACE_CURRENCY_SYMBOLS = True
171
- CHECK_NER_PROCESS = True
172
- CHECK_REMOVE_ISOLATED_LETTERS = True
173
- CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
174
- CHECK_NORMALIZE_WHITESPACE = True
175
- CHECK_STATISTICAL_MODEL_PROCESSING = True
176
- CHECK_CASEFOLD = True
177
- CHECK_REMOVE_STOPWORDS = True
178
- CHECK_REMOVE_PUNCTUATION = True
179
- CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
162
+ config.CHECK_DETECT_LANGUAGE = True
163
+ config.CHECK_FIX_BAD_UNICODE = True
164
+ config.CHECK_TO_ASCII_UNICODE = True
165
+ config.CHECK_REPLACE_HTML = True
166
+ config.CHECK_REPLACE_URLS = True
167
+ config.CHECK_REPLACE_EMAILS = True
168
+ config.CHECK_REPLACE_YEARS = True
169
+ config.CHECK_REPLACE_PHONE_NUMBERS = True
170
+ config.CHECK_REPLACE_NUMBERS = True
171
+ config.CHECK_REPLACE_CURRENCY_SYMBOLS = True
172
+ config.CHECK_NER_PROCESS = True
173
+ config.CHECK_REMOVE_ISOLATED_LETTERS = True
174
+ config.CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
175
+ config.CHECK_NORMALIZE_WHITESPACE = True
176
+ config.CHECK_STATISTICAL_MODEL_PROCESSING = True
177
+ config.CHECK_CASEFOLD = True
178
+ config.CHECK_REMOVE_STOPWORDS = True
179
+ config.CHECK_REMOVE_PUNCTUATION = True
180
+ config.CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS = True
180
181
  # Replacement tags can be customized; pass "" if no special tag is wanted
181
- REPLACE_WITH_URL = "<URL>"
182
- REPLACE_WITH_HTML = "<HTML>"
183
- REPLACE_WITH_EMAIL = "<EMAIL>"
184
- REPLACE_WITH_YEARS = "<YEAR>"
185
- REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
186
- REPLACE_WITH_NUMBERS = "<NUMBER>"
187
- REPLACE_WITH_CURRENCY_SYMBOLS = None
182
+ config.REPLACE_WITH_URL = "<URL>"
183
+ config.REPLACE_WITH_HTML = "<HTML>"
184
+ config.REPLACE_WITH_EMAIL = "<EMAIL>"
185
+ config.REPLACE_WITH_YEARS = "<YEAR>"
186
+ config.REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
187
+ config.REPLACE_WITH_NUMBERS = "<NUMBER>"
188
+ config.REPLACE_WITH_CURRENCY_SYMBOLS = None
188
189
  # You can remove any of the tags
189
- POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
190
- NER_CONFIDENCE_THRESHOLD = 0.85
190
+ config.POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
191
+ config.NER_CONFIDENCE_THRESHOLD = 0.85
191
192
  # Pass it as ENGLISH, DUTCH, GERMAN etc. if you know the language of text beforehand.
192
- LANGUAGE = None
193
+ config.LANGUAGE = None
193
194
 
194
195
  # The order of the models is important: English Model, Dutch Model, German Model, Spanish Model, Multilingual Model
195
196
  # All models passed need to support transformers AutoModel
196
- NER_MODELS_LIST = [
197
+ config.NER_MODELS_LIST = [
197
198
  "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
198
199
  "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch",
199
200
  "FacebookAI/xlm-roberta-large-finetuned-conll03-german",
200
201
  "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish",
201
202
  "Babelscape/wikineural-multilingual-ner"
202
203
  ]
204
+
205
+ sx = sct.TextCleaner()
203
206
  ```
204
207
 
205
208
  ## API
@@ -0,0 +1,100 @@
1
+ """
2
+ Module containing the configuration parameters for the SCT package.
3
+ """
4
+
5
+ # Flag to detect the language automatically. If True, the language will be detected for each text.
6
+ CHECK_DETECT_LANGUAGE = True
7
+
8
+ # Flag to fix "broken" unicode such as mojibake and garbled HTML entities.
9
+ CHECK_FIX_BAD_UNICODE = True
10
+
11
+ # Flag to convert non-ASCII characters into their closest ASCII equivalents.
12
+ CHECK_TO_ASCII_UNICODE = True
13
+
14
+ # Flag to replace HTML tags with a special token.
15
+ CHECK_REPLACE_HTML = True
16
+
17
+ # Flag to replace URLs with a special token.
18
+ CHECK_REPLACE_URLS = True
19
+
20
+ # Flag to replace email addresses with a special token.
21
+ CHECK_REPLACE_EMAILS = True
22
+
23
+ # Flag to replace years with a special token.
24
+ CHECK_REPLACE_YEARS = True
25
+
26
+ # Flag to replace phone numbers with a special token.
27
+ CHECK_REPLACE_PHONE_NUMBERS = True
28
+
29
+ # Flag to replace numbers with a special token.
30
+ CHECK_REPLACE_NUMBERS = True
31
+
32
+ # Flag to replace currency symbols with their respective alphabetical equivalents.
33
+ CHECK_REPLACE_CURRENCY_SYMBOLS = True
34
+
35
+ # Flag to execute Named Entity Recognition (NER) to remove positional tags such as PER, LOC, ORG, MISC.
36
+ CHECK_NER_PROCESS = True
37
+
38
+ # Flag to remove any isolated letters which do not add any value to the text.
39
+ CHECK_REMOVE_ISOLATED_LETTERS = True
40
+
41
+ # Flag to remove any isolated symbols which should not be present in the text.
42
+ CHECK_REMOVE_ISOLATED_SPECIAL_SYMBOLS = True
43
+
44
+ # Flag to remove any unnecessary whitespace.
45
+ CHECK_NORMALIZE_WHITESPACE = True
46
+
47
+ # Flag to get the statistical model text, such as for fastText, SVM, LR.
48
+ CHECK_STATISTICAL_MODEL_PROCESSING = True
49
+
50
+ # Flag to convert all characters to lowercase.
51
+ CHECK_CASEFOLD = True
52
+
53
+ # Flag to remove stopwords based on the language. Uses NLTK stopwords.
54
+ CHECK_REMOVE_STOPWORDS = True
55
+
56
+ # Flag to remove all special symbols.
57
+ CHECK_REMOVE_PUNCTUATION = True
58
+
59
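+ # Flag to remove custom stopwords specific to the SCT package. NOTE(review): the README documents this setting as CHECK_REMOVE_SCT_CUSTOM_STOP_WORDS - confirm which name the cleaner actually reads.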
60
+ CHECK_REMOVE_STEXT_CUSTOM_STOP_WORDS = True
61
+
62
+ # Special token to replace URLs.
63
+ REPLACE_WITH_URL = "<URL>"
64
+
65
+ # Special token to replace HTML tags.
66
+ REPLACE_WITH_HTML = "<HTML>"
67
+
68
+ # Special token to replace email addresses.
69
+ REPLACE_WITH_EMAIL = "<EMAIL>"
70
+
71
+ # Special token to replace years.
72
+ REPLACE_WITH_YEARS = "<YEAR>"
73
+
74
+ # Special token to replace phone numbers.
75
+ REPLACE_WITH_PHONE_NUMBERS = "<PHONE>"
76
+
77
+ # Special token to replace numbers.
78
+ REPLACE_WITH_NUMBERS = "<NUMBER>"
79
+
80
+ # Special token to replace currency symbols. If None, symbols will be replaced with their 3-letter abbreviations.
81
+ REPLACE_WITH_CURRENCY_SYMBOLS = None
82
+
83
+ # List of positional tags to be removed by NER.
84
+ POSITIONAL_TAGS = ['PER', 'LOC', 'ORG']
85
+
86
+ # Confidence threshold for NER.
87
+ NER_CONFIDENCE_THRESHOLD = 0.85
88
+
89
+ # Language to be used for NER. If None, the language will be detected automatically.
90
+ LANGUAGE = None
91
+
92
+ # List of pre-trained NER models. The order matters: English, Dutch, German, Spanish, then multilingual.
93
+ NER_MODELS_LIST = [
94
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-english", # English Model
95
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-dutch", # Dutch Model
96
+ "FacebookAI/xlm-roberta-large-finetuned-conll03-german", # German Model
97
+ "FacebookAI/xlm-roberta-large-finetuned-conll02-spanish", # Spanish Model
98
+ "Babelscape/wikineural-multilingual-ner" # Multilingual Model
99
+ ]
100
+