streamlit-octostar-utils 2.11a3__py3-none-any.whl → 2.11a5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- streamlit_octostar_utils/nlp/language.py +0 -27
- streamlit_octostar_utils/nlp/ner.py +39 -11
- {streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/METADATA +1 -1
- {streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/RECORD +6 -6
- {streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/WHEEL +0 -0
- {streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/licenses/LICENSE +0 -0
@@ -26,30 +26,3 @@ def detect_language(text, min_confidence=None):
|
|
26
26
|
detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
|
27
27
|
detected_lang = languages.to_name(detected_lang).lower()
|
28
28
|
return detected_lang, confidence
|
29
|
-
|
30
|
-
FLAIR_MODELS = {
|
31
|
-
"en": "flair/ner-english-large",
|
32
|
-
"es": "flair/ner-spanish-large",
|
33
|
-
"de": "flair/ner-german-large",
|
34
|
-
"nl": "flair/ner-dutch-large",
|
35
|
-
"multi": "flair/ner-multi",
|
36
|
-
"multi-fast": "flair/ner-multi-fast",
|
37
|
-
}
|
38
|
-
|
39
|
-
SPACY_MODELS = {
|
40
|
-
"en": 'en_core_web_sm',
|
41
|
-
}
|
42
|
-
|
43
|
-
def load_language_model(language, type):
|
44
|
-
from flair.models import SequenceTagger
|
45
|
-
from spacy_download import load_spacy
|
46
|
-
|
47
|
-
model = None
|
48
|
-
match type:
|
49
|
-
case "spacy":
|
50
|
-
model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
|
51
|
-
model = load_spacy(model_name)
|
52
|
-
case "flair":
|
53
|
-
model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
|
54
|
-
model = SequenceTagger.load(model_name)
|
55
|
-
return model
|
@@ -20,7 +20,7 @@ from sumy.summarizers.lsa import LsaSummarizer
|
|
20
20
|
from sumy.summarizers.luhn import LuhnSummarizer
|
21
21
|
from sumy.utils import get_stop_words
|
22
22
|
|
23
|
-
from language import alpha2_to_language
|
23
|
+
from .language import alpha2_to_language, language_to_alpha2
|
24
24
|
|
25
25
|
BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
|
26
26
|
"CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
|
@@ -67,6 +67,38 @@ BASE_TO_RECOGNIZER_EXPANSIONS = {
|
|
67
67
|
|
68
68
|
BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
|
69
69
|
|
70
|
+
FLAIR_MODELS = {
|
71
|
+
"en": "flair/ner-english-large",
|
72
|
+
"es": "flair/ner-spanish-large",
|
73
|
+
"de": "flair/ner-german-large",
|
74
|
+
"nl": "flair/ner-dutch-large",
|
75
|
+
"multi": "flair/ner-multi", # English, German, French, Spanish
|
76
|
+
"multi-fast": "flair/ner-multi-fast", # English, German, Dutch, Spanish
|
77
|
+
}
|
78
|
+
|
79
|
+
SPACY_MODELS = {
|
80
|
+
"en": "en_core_web_sm",
|
81
|
+
"es": "es_core_news_sm",
|
82
|
+
"fr": "fr_core_news_sm",
|
83
|
+
"de": "de_core_news_sm",
|
84
|
+
"it": "it_core_news_sm"
|
85
|
+
}
|
86
|
+
|
87
|
+
def load_language_model(language, type):
|
88
|
+
from flair.models import SequenceTagger
|
89
|
+
|
90
|
+
model = None
|
91
|
+
|
92
|
+
match type:
|
93
|
+
case "spacy":
|
94
|
+
model = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
|
95
|
+
|
96
|
+
case "flair":
|
97
|
+
model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
|
98
|
+
model = SequenceTagger.load(model_name)
|
99
|
+
|
100
|
+
return model
|
101
|
+
|
70
102
|
|
71
103
|
class FlairRecognizer(EntityRecognizer):
|
72
104
|
ENTITIES = [
|
@@ -83,15 +115,6 @@ class FlairRecognizer(EntityRecognizer):
|
|
83
115
|
({"ORG"}, {"ORG", "ORGANIZATION"}),
|
84
116
|
]
|
85
117
|
|
86
|
-
MODEL_LANGUAGES = {
|
87
|
-
"en": "flair/ner-english-large",
|
88
|
-
"es": "flair/ner-spanish-large",
|
89
|
-
"de": "flair/ner-german-large",
|
90
|
-
"nl": "flair/ner-dutch-large",
|
91
|
-
"multi": "flair/ner-multi",
|
92
|
-
"multi-fast": "flair/ner-multi-fast",
|
93
|
-
}
|
94
|
-
|
95
118
|
PRESIDIO_EQUIVALENCES = {
|
96
119
|
"PER": "PERSON",
|
97
120
|
"LOC": "LOC",
|
@@ -549,6 +572,7 @@ def ner_pipe(
|
|
549
572
|
engine_type="spacy",
|
550
573
|
fast=False,
|
551
574
|
compression_ratio="auto",
|
575
|
+
with_scores=False,
|
552
576
|
with_comentions=True,
|
553
577
|
with_context=True,
|
554
578
|
entities=None,
|
@@ -556,6 +580,9 @@ def ner_pipe(
|
|
556
580
|
batch_size=32,
|
557
581
|
n_process=4
|
558
582
|
):
|
583
|
+
if with_scores:
|
584
|
+
raise NotImplementedError("with_scores functionality is not implemented yet")
|
585
|
+
|
559
586
|
analyzer = build_presidio_analyzer(
|
560
587
|
language=language,
|
561
588
|
engine_type=engine_type,
|
@@ -613,13 +640,14 @@ def get_ner_handler(
|
|
613
640
|
except LookupError:
|
614
641
|
language = "en"
|
615
642
|
|
616
|
-
return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
|
643
|
+
return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
|
617
644
|
text,
|
618
645
|
language,
|
619
646
|
model,
|
620
647
|
engine_type,
|
621
648
|
fast,
|
622
649
|
compression_ratio,
|
650
|
+
with_scores,
|
623
651
|
with_comentions,
|
624
652
|
with_context,
|
625
653
|
entities,
|
{streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/RECORD
RENAMED
@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
|
|
20
20
|
streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
|
21
21
|
streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
|
22
22
|
streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
|
23
|
-
streamlit_octostar_utils/nlp/language.py,sha256=
|
24
|
-
streamlit_octostar_utils/nlp/ner.py,sha256=
|
23
|
+
streamlit_octostar_utils/nlp/language.py,sha256=l48rBoLLBpTZz40N2KWNSpAWc8smcWMtiiDXREhmLtE,926
|
24
|
+
streamlit_octostar_utils/nlp/ner.py,sha256=LwnGbQHoT2mitroc0WjM2lVjtSUW7OUhqNmLsLMpNYQ,21196
|
25
25
|
streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
|
26
26
|
streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
|
27
27
|
streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
|
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
|
|
36
36
|
streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
|
37
37
|
streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
|
38
38
|
streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
|
39
|
-
streamlit_octostar_utils-2.
|
40
|
-
streamlit_octostar_utils-2.
|
41
|
-
streamlit_octostar_utils-2.
|
42
|
-
streamlit_octostar_utils-2.
|
39
|
+
streamlit_octostar_utils-2.11a5.dist-info/METADATA,sha256=sa3ksvvDUHpMWd_szqcaFI_x9u7dVwc9Ctj1gcAyujg,2330
|
40
|
+
streamlit_octostar_utils-2.11a5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
|
41
|
+
streamlit_octostar_utils-2.11a5.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
|
42
|
+
streamlit_octostar_utils-2.11a5.dist-info/RECORD,,
|
{streamlit_octostar_utils-2.11a3.dist-info → streamlit_octostar_utils-2.11a5.dist-info}/WHEEL
RENAMED
File without changes
|
File without changes
|