streamlit-octostar-utils 2.11a4__tar.gz → 2.11a6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/PKG-INFO +1 -1
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/pyproject.toml +1 -1
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/language.py +33 -24
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/ner.py +20 -21
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/LICENSE +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/README.md +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-2.11a4 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,20 +1,39 @@
|
|
1
1
|
import re
|
2
2
|
import py3langid as langid
|
3
|
-
|
3
|
+
|
4
|
+
from iso639 import Lang
|
5
|
+
|
6
|
+
FLAIR_MODELS = {
|
7
|
+
"en": "flair/ner-english-large",
|
8
|
+
"es": "flair/ner-spanish-large",
|
9
|
+
"de": "flair/ner-german-large",
|
10
|
+
"nl": "flair/ner-dutch-large",
|
11
|
+
"multi": "flair/ner-multi", # English, German, French, Spanish
|
12
|
+
"multi-fast": "flair/ner-multi-fast", # English, German, Dutch, Spanish
|
13
|
+
}
|
14
|
+
|
15
|
+
SPACY_MODELS = {
|
16
|
+
"en": "en_core_web_sm",
|
17
|
+
"es": "es_core_news_sm",
|
18
|
+
"fr": "fr_core_news_sm",
|
19
|
+
"de": "de_core_news_sm",
|
20
|
+
"it": "it_core_news_sm"
|
21
|
+
}
|
4
22
|
|
5
23
|
|
6
24
|
def alpha2_to_language(alpha2: str) -> str:
|
7
25
|
if not alpha2:
|
8
|
-
|
9
|
-
|
10
|
-
|
26
|
+
raise ValueError("Language code must be a non-empty string.")
|
27
|
+
return Lang(alpha2).name
|
28
|
+
|
11
29
|
|
12
30
|
def language_to_alpha2(language_name: str) -> str:
|
13
31
|
if not language_name:
|
14
|
-
|
15
|
-
|
16
|
-
|
17
|
-
return
|
32
|
+
raise ValueError("Language name must be a non-empty string.")
|
33
|
+
|
34
|
+
name = re.sub(r'\b\w+', lambda m: m.group(0).capitalize(), name)
|
35
|
+
return Lang(name).pt1
|
36
|
+
|
18
37
|
|
19
38
|
def detect_language(text, min_confidence=None):
|
20
39
|
detector = langid.langid.LanguageIdentifier.from_pickled_model(
|
@@ -23,33 +42,23 @@ def detect_language(text, min_confidence=None):
|
|
23
42
|
detected_lang, confidence = detector.classify(text)
|
24
43
|
if min_confidence and confidence < min_confidence:
|
25
44
|
return None, confidence
|
26
|
-
detected_lang =
|
27
|
-
detected_lang = languages.to_name(detected_lang).lower()
|
45
|
+
detected_lang = alpha2_to_language(detected_lang)
|
28
46
|
return detected_lang, confidence
|
29
47
|
|
30
|
-
FLAIR_MODELS = {
|
31
|
-
"en": "flair/ner-english-large",
|
32
|
-
"es": "flair/ner-spanish-large",
|
33
|
-
"de": "flair/ner-german-large",
|
34
|
-
"nl": "flair/ner-dutch-large",
|
35
|
-
"multi": "flair/ner-multi",
|
36
|
-
"multi-fast": "flair/ner-multi-fast",
|
37
|
-
}
|
38
|
-
|
39
|
-
SPACY_MODELS = {
|
40
|
-
"en": 'en_core_web_sm',
|
41
|
-
}
|
42
48
|
|
43
49
|
def load_language_model(language, type):
|
44
50
|
from flair.models import SequenceTagger
|
45
51
|
from spacy_download import load_spacy
|
46
52
|
|
47
53
|
model = None
|
54
|
+
|
48
55
|
match type:
|
49
56
|
case "spacy":
|
50
|
-
model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
|
57
|
+
model_name = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
|
51
58
|
model = load_spacy(model_name)
|
59
|
+
|
52
60
|
case "flair":
|
53
61
|
model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
|
54
62
|
model = SequenceTagger.load(model_name)
|
55
|
-
|
63
|
+
|
64
|
+
return model
|
@@ -83,15 +83,6 @@ class FlairRecognizer(EntityRecognizer):
|
|
83
83
|
({"ORG"}, {"ORG", "ORGANIZATION"}),
|
84
84
|
]
|
85
85
|
|
86
|
-
MODEL_LANGUAGES = {
|
87
|
-
"en": "flair/ner-english-large",
|
88
|
-
"es": "flair/ner-spanish-large",
|
89
|
-
"de": "flair/ner-german-large",
|
90
|
-
"nl": "flair/ner-dutch-large",
|
91
|
-
"multi": "flair/ner-multi",
|
92
|
-
"multi-fast": "flair/ner-multi-fast",
|
93
|
-
}
|
94
|
-
|
95
86
|
PRESIDIO_EQUIVALENCES = {
|
96
87
|
"PER": "PERSON",
|
97
88
|
"LOC": "LOC",
|
@@ -407,7 +398,7 @@ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, lang
|
|
407
398
|
|
408
399
|
|
409
400
|
def compute_ner_presidio(
|
410
|
-
|
401
|
+
text_or_df,
|
411
402
|
language,
|
412
403
|
analyzer,
|
413
404
|
entities=None,
|
@@ -418,15 +409,15 @@ def compute_ner_presidio(
|
|
418
409
|
batch_size=32,
|
419
410
|
n_process=4
|
420
411
|
):
|
421
|
-
if isinstance(
|
422
|
-
if len(
|
423
|
-
return analyze_dataframe_optimized(
|
412
|
+
if isinstance(text_or_df, pd.DataFrame):
|
413
|
+
if len(text_or_df) >= 100:
|
414
|
+
return analyze_dataframe_optimized(text_or_df, analyzer, language, entities, score_threshold)
|
424
415
|
|
425
416
|
else:
|
426
417
|
texts = []
|
427
418
|
|
428
|
-
for col in
|
429
|
-
for idx, value in
|
419
|
+
for col in text_or_df.columns:
|
420
|
+
for idx, value in text_or_df[col].dropna().items():
|
430
421
|
text_value = str(value).strip()
|
431
422
|
|
432
423
|
if text_value:
|
@@ -434,7 +425,8 @@ def compute_ner_presidio(
|
|
434
425
|
|
435
426
|
text = "\n".join(texts)
|
436
427
|
|
437
|
-
elif isinstance(
|
428
|
+
elif isinstance(text_or_df, list):
|
429
|
+
text = text_or_df
|
438
430
|
batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
|
439
431
|
|
440
432
|
results_generator = batch_analyzer.analyze_iterator(
|
@@ -543,12 +535,13 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
|
|
543
535
|
|
544
536
|
|
545
537
|
def ner_pipe(
|
546
|
-
|
538
|
+
text_or_df,
|
547
539
|
language,
|
548
540
|
model,
|
549
541
|
engine_type="spacy",
|
550
542
|
fast=False,
|
551
543
|
compression_ratio="auto",
|
544
|
+
with_scores=False,
|
552
545
|
with_comentions=True,
|
553
546
|
with_context=True,
|
554
547
|
entities=None,
|
@@ -556,15 +549,18 @@ def ner_pipe(
|
|
556
549
|
batch_size=32,
|
557
550
|
n_process=4
|
558
551
|
):
|
552
|
+
if with_scores:
|
553
|
+
raise NotImplementedError("with_scores functionality is not implemented yet")
|
554
|
+
|
559
555
|
analyzer = build_presidio_analyzer(
|
560
556
|
language=language,
|
561
557
|
engine_type=engine_type,
|
562
558
|
model=model,
|
563
559
|
)
|
564
560
|
|
565
|
-
if isinstance(
|
561
|
+
if isinstance(text_or_df, pd.DataFrame):
|
566
562
|
ner = compute_ner_presidio(
|
567
|
-
|
563
|
+
text_or_df,
|
568
564
|
language,
|
569
565
|
analyzer,
|
570
566
|
entities,
|
@@ -575,6 +571,8 @@ def ner_pipe(
|
|
575
571
|
n_process=n_process
|
576
572
|
)
|
577
573
|
else:
|
574
|
+
text = text_or_df
|
575
|
+
|
578
576
|
if compression_ratio == "auto":
|
579
577
|
compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
|
580
578
|
|
@@ -613,13 +611,14 @@ def get_ner_handler(
|
|
613
611
|
except LookupError:
|
614
612
|
language = "en"
|
615
613
|
|
616
|
-
return lambda
|
617
|
-
|
614
|
+
return lambda text_or_df, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
|
615
|
+
text_or_df,
|
618
616
|
language,
|
619
617
|
model,
|
620
618
|
engine_type,
|
621
619
|
fast,
|
622
620
|
compression_ratio,
|
621
|
+
with_scores,
|
623
622
|
with_comentions,
|
624
623
|
with_context,
|
625
624
|
entities,
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|