streamlit-octostar-utils 2.11a1__tar.gz → 2.11a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/pyproject.toml +1 -1
  3. streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/language.py +55 -0
  4. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/nlp/ner.py +81 -31
  5. streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/language.py +0 -15
  6. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/LICENSE +0 -0
  7. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/README.md +0 -0
  8. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/__init__.py +0 -0
  9. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  10. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  11. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  13. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  31. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/client.py +0 -0
  32. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/context.py +0 -0
  33. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  34. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  35. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  36. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  37. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/validation.py +0 -0
  38. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/style/__init__.py +0 -0
  39. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/style/common.py +0 -0
  40. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/__init__.py +0 -0
  41. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  42. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  43. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
--- streamlit_octostar_utils-2.11a1/PKG-INFO
+++ streamlit_octostar_utils-2.11a3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version: 2.11a1
+Version: 2.11a3
 Summary:
 License: MIT
 License-File: LICENSE
--- streamlit_octostar_utils-2.11a1/pyproject.toml
+++ streamlit_octostar_utils-2.11a3/pyproject.toml
@@ -5,7 +5,7 @@ include = '\.pyi?$'
 
 [tool.poetry]
 name = "streamlit-octostar-utils"
-version = "2.11a1"
+version = "2.11a3"
 description = ""
 license = "MIT"
 authors = ["Octostar"]
--- /dev/null
+++ streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/language.py
@@ -0,0 +1,55 @@
+import re
+import py3langid as langid
+import iso639 as languages
+
+
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
+def detect_language(text, min_confidence=None):
+    detector = langid.langid.LanguageIdentifier.from_pickled_model(
+        langid.langid.MODEL_FILE, norm_probs=True
+    )
+    detected_lang, confidence = detector.classify(text)
+    if min_confidence and confidence < min_confidence:
+        return None, confidence
+    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
+    detected_lang = languages.to_name(detected_lang).lower()
+    return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
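The new language.py module centralizes language handling: ISO 639 code/name conversion via iso639, detection via py3langid, and per-language NER model selection. A minimal usage sketch, assuming the installed iso639 package resolves the codes shown (the sample string is illustrative):

    from streamlit_octostar_utils.nlp.language import (
        alpha2_to_language,
        detect_language,
        load_language_model,
    )

    print(alpha2_to_language("it"))  # expected: "Italian"

    # Returns (language_name, confidence), or (None, confidence) when the
    # normalized probability falls below min_confidence.
    lang, conf = detect_language("Questo è un testo di esempio.", min_confidence=0.5)

    # Languages without a dedicated entry fall back to en_core_web_sm (spaCy)
    # or flair/ner-multi (Flair).
    tagger = load_language_model("it", "flair")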
--- streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/ner.py
+++ streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/ner.py
@@ -20,20 +20,67 @@ from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 
+from .language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
+}
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
+}
+
+BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
+
 
 class FlairRecognizer(EntityRecognizer):
     ENTITIES = [
-        "LOCATION",
+        "LOC",
         "PERSON",
-        "ORGANIZATION",
+        "ORG",
     ]
 
     DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
 
     CHECK_LABEL_GROUPS = [
-        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"LOC"}, {"LOC", "LOCATION"}),
         ({"PERSON"}, {"PER", "PERSON"}),
-        ({"ORGANIZATION"}, {"ORG"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
     ]
 
     MODEL_LANGUAGES = {
@@ -47,8 +94,8 @@ class FlairRecognizer(EntityRecognizer):
 
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
+        "LOC": "LOC",
+        "ORG": "ORG"
     }
 
     def __init__(
@@ -138,8 +185,17 @@ class FlairRecognizer(EntityRecognizer):
         )
 
 
-BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
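Together with the tables introduced at the top of ner.py, these two helpers make the base label set the single vocabulary at the API boundary: expand_entities_for_analyzer widens a base-label filter into every recognizer-specific alias before analysis, and normalize_label folds recognizer output back onto a base label. A small round-trip sketch using only names defined in this diff:

    requested = ["PHONE", "LOC"]
    analyzer_entities = expand_entities_for_analyzer(requested)
    # e.g. ["PHONE", "PHONE_NUMBER", "LOC", "LOCATION"] (set order varies)

    print(normalize_label("PHONE_NUMBER"))  # "PHONE"
    print(normalize_label("PERSON"))        # unmapped labels pass through unchanged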
@@ -175,17 +231,13 @@ def _sumy__luhn_call(summarizer, document):
 
 
 def get_nltk_tokenizer(language: str) -> Tokenizer:
-    if language == "en":
-        nltk_lang = "english"
-    elif language == "it":
-        nltk_lang = "italian"
-    else:
-        nltk_lang = language
+    nltk_lang = alpha2_to_language(language).lower()
 
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt")
+
     return Tokenizer(nltk_lang)
 
 
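get_nltk_tokenizer now derives the NLTK language name from the ISO alpha-2 code instead of hard-coding the en/it cases, so any code that iso639 can resolve works; codes it cannot resolve will presumably raise now rather than fall through verbatim as before. For example:

    tokenizer = get_nltk_tokenizer("de")  # alpha2_to_language("de") -> "German" -> "german"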
@@ -251,7 +303,7 @@ def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=Non
     default_registry = RecognizerRegistry()
     default_registry.load_predefined_recognizers()
 
-    flair_handled_entities = {"PERSON", "LOCATION", "ORGANIZATION"}
+    flair_handled_entities = {"PERSON", "LOC", "ORG"}
 
     for recognizer in default_registry.recognizers:
         recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
@@ -305,12 +357,12 @@ def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, la
         results = analyzer.analyze(
             text=text,
             language=language,
-            entities=entities if entities else None
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
         )
 
         for result in results:
             if result.score >= score_threshold:
-                entity_counter[result.entity_type] += 1
+                entity_counter[normalize_label(result.entity_type)] += 1
 
     if not entity_counter:
         return None
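With both helpers in place, analyze_column_sample tallies only base labels: requested entities are expanded before the analyzer call and every result is normalized before counting, so aliases such as PHONE_NUMBER and PHONE accumulate under one key. A sketch of the counting step with mock analyzer output (the entity types are illustrative):

    from collections import Counter

    entity_counter = Counter()
    for entity_type in ["PHONE_NUMBER", "PHONE", "EMAIL_ADDRESS"]:  # mock results
        entity_counter[normalize_label(entity_type)] += 1

    print(entity_counter)  # Counter({'PHONE': 2, 'EMAIL': 1})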
@@ -390,7 +442,7 @@ def compute_ner_presidio(
         language=language,
         batch_size=batch_size,
         n_process=n_process,
-        entities=entities if entities else None,
+        entities=(expand_entities_for_analyzer(entities) if entities else None),
     )
 
     all_results = list(results_generator)
@@ -405,7 +457,7 @@ def compute_ner_presidio(
 
             ner_objects.append(NERObject(
                 name=text_item[result.start:result.end],
-                label=result.entity_type,
+                label=normalize_label(result.entity_type),
                 score=float(result.score),
                 start=int(result.start),
                 count=1,
@@ -417,7 +469,7 @@ def compute_ner_presidio(
     results = analyzer.analyze(
         text=text,
         language=language,
-        entities=entities if entities else None
+        entities=(expand_entities_for_analyzer(entities) if entities else None)
     )
 
     ner_objects = []
@@ -430,7 +482,7 @@ def compute_ner_presidio(
 
         ner_objects.append(NERObject(
             name=text[result.start:result.end],
-            label=result.entity_type,
+            label=normalize_label(result.entity_type),
             score=float(result.score),
             start=int(result.start),
             count=1,
@@ -517,11 +569,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-            150,
-            with_comentions,
-            with_context,
-            batch_size,
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
     else:
         if compression_ratio == "auto":
@@ -538,11 +589,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-            150,
-            with_comentions,
-            with_context,
-            batch_size,
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
 
     return ner
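The ner_pipe fix replaces a positional argument list, in which a literal 150 was passed after score_threshold and every following value shifted into the wrong parameter, with explicit keywords, so each value now reaches its intended parameter regardless of signature order. A toy illustration of the failure mode (hypothetical signature, not the real one):

    def analyze(threshold, window=150, with_context=False):
        ...

    analyze(0.5, True)               # positional: True silently lands in window
    analyze(0.5, with_context=True)  # keyword: immune to parameter order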
--- streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/language.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import re
-import py3langid as langid
-import iso639 as languages
-
-
-def detect_language(text, min_confidence=None):
-    detector = langid.langid.LanguageIdentifier.from_pickled_model(
-        langid.langid.MODEL_FILE, norm_probs=True
-    )
-    detected_lang, confidence = detector.classify(text)
-    if min_confidence and confidence < min_confidence:
-        return None, confidence
-    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
-    detected_lang = languages.to_name(detected_lang).lower()
-    return detected_lang, confidence