streamlit-octostar-utils 2.11a1__py3-none-any.whl → 2.11a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- streamlit_octostar_utils/nlp/language.py +40 -0
- streamlit_octostar_utils/nlp/ner.py +81 -31
- {streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/METADATA +1 -1
- {streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/RECORD +6 -6
- {streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/WHEEL +1 -1
- {streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/licenses/LICENSE +0 -0
streamlit_octostar_utils/nlp/language.py
CHANGED

@@ -3,6 +3,19 @@ import py3langid as langid
 import iso639 as languages
 
 
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
 def detect_language(text, min_confidence=None):
     detector = langid.langid.LanguageIdentifier.from_pickled_model(
         langid.langid.MODEL_FILE, norm_probs=True
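The two helpers above are thin wrappers over the `iso639` package this module already imports. A minimal round-trip sketch, assuming the helpers are importable from `streamlit_octostar_utils.nlp.language` and that the `iso639` data maps "en" to "English" (neither assumption is shown in the diff):

```python
from streamlit_octostar_utils.nlp.language import alpha2_to_language, language_to_alpha2

print(alpha2_to_language(" EN "))     # "English": input is stripped and lowercased first
print(language_to_alpha2("English"))  # "en": resolved via languages.find(...)["iso639_1"]
print(alpha2_to_language(""))         # None: falsy input short-circuits before any lookup
```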
@@ -13,3 +26,30 @@ def detect_language(text, min_confidence=None):
     detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
     detected_lang = languages.to_name(detected_lang).lower()
     return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
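`load_language_model` keeps the heavy `flair` and `spacy_download` imports inside the function body, so neither dependency is loaded until a model is actually requested, and unknown languages fall back to English (spaCy) or `flair/ner-multi` (Flair). A usage sketch, assuming both optional dependencies are installed:

```python
from streamlit_octostar_utils.nlp.language import load_language_model

nlp = load_language_model("en", "spacy")       # en_core_web_sm via spacy_download
tagger = load_language_model("fr", "flair")    # "fr" not in FLAIR_MODELS: flair/ner-multi
nothing = load_language_model("en", "stanza")  # no matching case: returns None
```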
streamlit_octostar_utils/nlp/ner.py
CHANGED

@@ -20,20 +20,67 @@ from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 
+from nlp.language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
+}
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
+}
+
+BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
+
 
 class FlairRecognizer(EntityRecognizer):
     ENTITIES = [
-        "LOCATION",
+        "LOC",
         "PERSON",
-        "ORGANIZATION",
+        "ORG",
     ]
 
     DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
 
     CHECK_LABEL_GROUPS = [
-        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"LOC"}, {"LOC", "LOCATION"}),
         ({"PERSON"}, {"PER", "PERSON"}),
-        ({"ORGANIZATION"}, {"ORG", "ORGANIZATION"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
     ]
 
     MODEL_LANGUAGES = {
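The three module-level tables added in this hunk define one canonical label vocabulary: `BASE_ALLOWED_LABELS` is what callers see, `PRESIDIO_TO_BASE_ALIASES` folds recognizer-specific spellings onto it, and `BASE_TO_RECOGNIZER_EXPANSIONS` maps back out. A standalone illustration of the folding direction, using a trimmed copy of the alias table rather than the module itself:

```python
# Trimmed copy of PRESIDIO_TO_BASE_ALIASES, for illustration only.
ALIASES = {"PHONE_NUMBER": "PHONE", "DATE_TIME": "DATE", "LOCATION": "LOC"}

for raw in ("PHONE_NUMBER", "DATE_TIME", "PERSON"):
    print(raw, "->", ALIASES.get(raw, raw))
# PHONE_NUMBER -> PHONE
# DATE_TIME -> DATE
# PERSON -> PERSON  (already canonical, passes through unchanged)
```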
@@ -47,8 +94,8 @@ class FlairRecognizer(EntityRecognizer):
 
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
+        "LOC": "LOC",
+        "ORG": "ORG"
     }
 
     def __init__(
@@ -138,8 +185,17 @@ class FlairRecognizer(EntityRecognizer):
         )
 
 
-
-
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
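`expand_entities_for_analyzer` is the outbound half of the alias scheme: before analysis, each base label is expanded into every spelling a recognizer might emit, de-duplicated through a set; `normalize_label` then folds hits back on the way in. A self-contained sketch with a trimmed expansion table (the real function returns an unsorted list; `sorted` here is only for stable output):

```python
EXPANSIONS = {"LOC": ["LOC", "LOCATION"], "PHONE": ["PHONE", "PHONE_NUMBER"]}

def expand(entities):
    # Set comprehension de-duplicates, mirroring expand_entities_for_analyzer.
    return sorted({v for e in entities for v in EXPANSIONS.get(e, [e])})

print(expand(["LOC", "PHONE", "GPE"]))
# ['GPE', 'LOC', 'LOCATION', 'PHONE', 'PHONE_NUMBER']: unknown labels pass through as [e]
```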
@@ -175,17 +231,13 @@ def _sumy__luhn_call(summarizer, document):
 
 
 def get_nltk_tokenizer(language: str) -> Tokenizer:
-    if language == "en":
-        nltk_lang = "english"
-    elif language == "it":
-        nltk_lang = "italian"
-    else:
-        nltk_lang = language
+    nltk_lang = alpha2_to_language(language).lower()
 
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt")
+
     return Tokenizer(nltk_lang)
 
 
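The hard-coded English/Italian branch is gone: any alpha-2 code with an ISO 639 name now resolves through `alpha2_to_language` to the lowercase name sumy's `Tokenizer` expects. A sketch, assuming the `iso639` data maps "de" to "German":

```python
from streamlit_octostar_utils.nlp.language import alpha2_to_language

print(alpha2_to_language("de").lower())  # "german", accepted by sumy's Tokenizer
# Caveat visible in the diff: alpha2_to_language returns None for falsy input,
# so an empty language code would raise AttributeError on .lower() here.
```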
@@ -251,7 +303,7 @@ def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=Non
     default_registry = RecognizerRegistry()
     default_registry.load_predefined_recognizers()
 
-    flair_handled_entities = {"PERSON", "LOCATION", "ORGANIZATION"}
+    flair_handled_entities = {"PERSON", "LOC", "ORG"}
 
     for recognizer in default_registry.recognizers:
         recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
@@ -305,12 +357,12 @@ def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, la
         results = analyzer.analyze(
             text=text,
             language=language,
-            entities=entities if entities else None
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
         )
 
         for result in results:
             if result.score >= score_threshold:
-                entity_counter[result.entity_type] += 1
+                entity_counter[normalize_label(result.entity_type)] += 1
 
     if not entity_counter:
         return None
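Counting `normalize_label(result.entity_type)` instead of the raw type means aliased hits accumulate in one bucket per base label rather than splitting across spellings. A toy illustration with hypothetical analyzer output:

```python
from collections import Counter

ALIASES = {"PHONE_NUMBER": "PHONE", "LOCATION": "LOC"}  # trimmed alias table
hits = ["PHONE_NUMBER", "PHONE", "LOCATION", "LOC"]     # hypothetical results

entity_counter = Counter(ALIASES.get(h, h) for h in hits)
print(entity_counter)  # PHONE: 2, LOC: 2 -- one bucket per base label
```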
@@ -390,7 +442,7 @@ def compute_ner_presidio(
             language=language,
             batch_size=batch_size,
             n_process=n_process,
-            entities=entities if entities else None,
+            entities=(expand_entities_for_analyzer(entities) if entities else None),
         )
 
         all_results = list(results_generator)
@@ -405,7 +457,7 @@ def compute_ner_presidio(
 
             ner_objects.append(NERObject(
                 name=text_item[result.start:result.end],
-                label=result.entity_type,
+                label=normalize_label(result.entity_type),
                 score=float(result.score),
                 start=int(result.start),
                 count=1,
@@ -417,7 +469,7 @@ def compute_ner_presidio(
         results = analyzer.analyze(
             text=text,
             language=language,
-            entities=entities if entities else None
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
         )
 
         ner_objects = []
@@ -430,7 +482,7 @@ def compute_ner_presidio(
 
         ner_objects.append(NERObject(
             name=text[result.start:result.end],
-            label=result.entity_type,
+            label=normalize_label(result.entity_type),
             score=float(result.score),
             start=int(result.start),
             count=1,
@@ -517,11 +569,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-
-
-
-
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
     else:
         if compression_ratio == "auto":
@@ -538,11 +589,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-
-
-
-
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
 
     return ner
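The final two hunks pass `with_comentions`, `with_context`, `batch_size`, and `n_process` by keyword rather than by position, so each value binds to the parameter it names. A toy illustration of the bug class this prevents (the signature below is hypothetical, not the actual callee in `ner_pipe`):

```python
def run(analyzer, entities, score_threshold, with_comentions=True,
        with_context=True, batch_size=1, n_process=1):
    return batch_size, n_process

# Keyword binding is order-independent and survives signature changes:
print(run(None, [], 0.5,
          n_process=4, batch_size=32,
          with_comentions=False, with_context=False))  # (32, 4)
```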
{streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/RECORD
RENAMED

@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
 streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
 streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
 streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
-streamlit_octostar_utils/nlp/language.py,sha256=
-streamlit_octostar_utils/nlp/ner.py,sha256=
+streamlit_octostar_utils/nlp/language.py,sha256=2d8Wq8wTuo_ehjZekuoe3bgJD52ieEiZKDUPdKdOxZ0,1699
+streamlit_octostar_utils/nlp/ner.py,sha256=fuEbmrzXODVqm5piZdfNGkLGSwkrYrJO8KaeKUh7Uk0,20384
 streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
 streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
 streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
 streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
 streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
+streamlit_octostar_utils-2.11a2.dist-info/METADATA,sha256=lL8vvLY29MCTZ_gopVIlnWx436E3ZAyE6QGX9cY9qO8,2330
+streamlit_octostar_utils-2.11a2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+streamlit_octostar_utils-2.11a2.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+streamlit_octostar_utils-2.11a2.dist-info/RECORD,,
{streamlit_octostar_utils-2.11a1.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/licenses/LICENSE
RENAMED
File without changes