streamlit-octostar-utils 2.11a5-py3-none-any.whl → 2.11a6-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- streamlit_octostar_utils/nlp/language.py +46 -10
- streamlit_octostar_utils/nlp/ner.py +16 -45
- {streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/METADATA +1 -1
- {streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/RECORD +6 -6
- {streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/WHEEL +0 -0
- {streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/licenses/LICENSE +0 -0
streamlit_octostar_utils/nlp/language.py
CHANGED
@@ -1,20 +1,39 @@
 import re
 import py3langid as langid
-
+
+from iso639 import Lang
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",  # English, German, French, Spanish
+    "multi-fast": "flair/ner-multi-fast",  # English, German, Dutch, Spanish
+}
+
+SPACY_MODELS = {
+    "en": "en_core_web_sm",
+    "es": "es_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "de": "de_core_news_sm",
+    "it": "it_core_news_sm"
+}
 
 
 def alpha2_to_language(alpha2: str) -> str:
     if not alpha2:
-
-
-
+        raise ValueError("Language code must be a non-empty string.")
+    return Lang(alpha2).name
+
 
 def language_to_alpha2(language_name: str) -> str:
     if not language_name:
-
-
-
-        return
+        raise ValueError("Language name must be a non-empty string.")
+
+    name = re.sub(r'\b\w+', lambda m: m.group(0).capitalize(), name)
+    return Lang(name).pt1
+
 
 def detect_language(text, min_confidence=None):
     detector = langid.langid.LanguageIdentifier.from_pickled_model(
@@ -23,6 +42,23 @@ def detect_language(text, min_confidence=None):
     detected_lang, confidence = detector.classify(text)
     if min_confidence and confidence < min_confidence:
         return None, confidence
-    detected_lang =
-    detected_lang = languages.to_name(detected_lang).lower()
+    detected_lang = alpha2_to_language(detected_lang)
     return detected_lang, confidence
+
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+
+    return model
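
The load_language_model helper added above dispatches on its type argument: "flair" looks the language up in FLAIR_MODELS (falling back to the multilingual "flair/ner-multi" tagger), while "spacy" maps the language name to an alpha-2 code and loads the matching SPACY_MODELS entry via spacy_download. A minimal usage sketch for the flair path, assuming the optional flair dependency is installed and that the model weights can be downloaded on first use:

    from streamlit_octostar_utils.nlp.language import load_language_model

    # "en" is a key in FLAIR_MODELS, so this resolves to "flair/ner-english-large";
    # an unrecognized code would fall back to the multilingual "flair/ner-multi" tagger.
    tagger = load_language_model("en", "flair")
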
streamlit_octostar_utils/nlp/ner.py
CHANGED
@@ -20,7 +20,7 @@ from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 
-from .language import alpha2_to_language
+from .language import alpha2_to_language
 
 BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
                        "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
@@ -67,38 +67,6 @@ BASE_TO_RECOGNIZER_EXPANSIONS = {
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
 
-FLAIR_MODELS = {
-    "en": "flair/ner-english-large",
-    "es": "flair/ner-spanish-large",
-    "de": "flair/ner-german-large",
-    "nl": "flair/ner-dutch-large",
-    "multi": "flair/ner-multi",  # English, German, French, Spanish
-    "multi-fast": "flair/ner-multi-fast",  # English, German, Dutch, Spanish
-}
-
-SPACY_MODELS = {
-    "en": "en_core_web_sm",
-    "es": "es_core_news_sm",
-    "fr": "fr_core_news_sm",
-    "de": "de_core_news_sm",
-    "it": "it_core_news_sm"
-}
-
-def load_language_model(language, type):
-    from flair.models import SequenceTagger
-
-    model = None
-
-    match type:
-        case "spacy":
-            model = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
-
-        case "flair":
-            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
-            model = SequenceTagger.load(model_name)
-
-    return model
-
 
 class FlairRecognizer(EntityRecognizer):
     ENTITIES = [
@@ -430,7 +398,7 @@ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, lang
 
 
 def compute_ner_presidio(
-
+    text_or_df,
     language,
     analyzer,
     entities=None,
@@ -441,15 +409,15 @@ def compute_ner_presidio(
     batch_size=32,
     n_process=4
 ):
-    if isinstance(
-        if len(
-            return analyze_dataframe_optimized(
+    if isinstance(text_or_df, pd.DataFrame):
+        if len(text_or_df) >= 100:
+            return analyze_dataframe_optimized(text_or_df, analyzer, language, entities, score_threshold)
 
         else:
             texts = []
 
-            for col in
-                for idx, value in
+            for col in text_or_df.columns:
+                for idx, value in text_or_df[col].dropna().items():
                     text_value = str(value).strip()
 
                     if text_value:
@@ -457,7 +425,8 @@ def compute_ner_presidio(
 
             text = "\n".join(texts)
 
-    elif isinstance(
+    elif isinstance(text_or_df, list):
+        text = text_or_df
     batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
 
     results_generator = batch_analyzer.analyze_iterator(
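
For DataFrames below the 100-row cutoff, the change above flattens the frame into a single newline-joined string before handing it to the batch analyzer. A standalone sketch of that flattening step (it mirrors the loop in the hunk; the sample DataFrame is illustrative and nothing from the package is imported):

    import pandas as pd

    df = pd.DataFrame({"name": ["Alice", None], "note": ["Met Bob in Paris", "Works at ACME Corp"]})

    # Every non-null cell is stringified and stripped, then the pieces are joined with newlines.
    texts = []
    for col in df.columns:
        for idx, value in df[col].dropna().items():
            text_value = str(value).strip()
            if text_value:
                texts.append(text_value)

    text = "\n".join(texts)
    # text == "Alice\nMet Bob in Paris\nWorks at ACME Corp"
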
@@ -566,7 +535,7 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
 
 
 def ner_pipe(
-
+    text_or_df,
     language,
     model,
     engine_type="spacy",
@@ -589,9 +558,9 @@ def ner_pipe(
         model=model,
     )
 
-    if isinstance(
+    if isinstance(text_or_df, pd.DataFrame):
         ner = compute_ner_presidio(
-
+            text_or_df,
             language,
             analyzer,
             entities,
@@ -602,6 +571,8 @@ def ner_pipe(
             n_process=n_process
         )
     else:
+        text = text_or_df
+
         if compression_ratio == "auto":
             compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
 
@@ -640,8 +611,8 @@ def get_ner_handler(
     except LookupError:
         language = "en"
 
-    return lambda
-
+    return lambda text_or_df, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
+        text_or_df,
         language,
         model,
         engine_type,
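
The get_ner_handler hunk above only widens the first argument of the returned callable from plain text to text_or_df; the keyword arguments are unchanged. A hedged sketch of the two call shapes, using a stand-in function because the arguments to get_ner_handler itself are not shown in this diff:

    import pandas as pd

    # Hypothetical stand-in for the callable returned by get_ner_handler;
    # the real handler forwards to ner_pipe with the captured language/model/engine.
    def handler(text_or_df, compression_ratio="auto", with_scores=False,
                with_comentions=True, with_context=True):
        return text_or_df  # placeholder body

    handler("Alice met Bob in Paris.")                   # plain text, as before
    handler(pd.DataFrame({"notes": ["Alice met Bob"]}))  # DataFrame input, new in 2.11a6
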
{streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/RECORD
RENAMED
@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
 streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
 streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
 streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
-streamlit_octostar_utils/nlp/language.py,sha256=
-streamlit_octostar_utils/nlp/ner.py,sha256=
+streamlit_octostar_utils/nlp/language.py,sha256=zmzGVd_RcJ3O5DHLOTjntZgnxQ5vKhxWu24_ihC1y8w,1929
+streamlit_octostar_utils/nlp/ner.py,sha256=5swAuH7r9xZ7c48ApqZfLqidjdf6f2qxK52KLk7-9Cc,20406
 streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
 streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
 streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
 streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
 streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
-streamlit_octostar_utils-2.
+streamlit_octostar_utils-2.11a6.dist-info/METADATA,sha256=7FI-njG_MgeGy-YcXWQ_40COdjEHLnE3u3oSLRLIpNI,2330
+streamlit_octostar_utils-2.11a6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+streamlit_octostar_utils-2.11a6.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+streamlit_octostar_utils-2.11a6.dist-info/RECORD,,
{streamlit_octostar_utils-2.11a5.dist-info → streamlit_octostar_utils-2.11a6.dist-info}/WHEEL
RENAMED
File without changes