streamlit-octostar-utils 2.11a5__py3-none-any.whl → 2.11a8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -330,7 +330,7 @@ class DefaultErrorRoute:
330
330
  if len(message) > MAX_ERROR_MESSAGE_BYTES:
331
331
  message = message[-MAX_ERROR_MESSAGE_BYTES:]
332
332
  try:
333
- tcbk = traceback.format_exception(exc)
333
+ tcbk = "\n".join(traceback.format_exception(exc))
334
334
  if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
335
335
  tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
336
336
  except:
@@ -23,7 +23,7 @@ from octostar.client import make_client
23
23
  from ..core.dict import recursive_update_dict, travel_dict, jsondict_hash
24
24
  from ..core.timestamp import now, string_to_datetime
25
25
  from .fastapi import DefaultErrorRoute, Route
26
- from ..ontology.inheritance import is_child_concept as is_child_concept_fn
26
+ from ..ontology.inheritance import is_child_concept as is_child_concept_fn, get_label_keys
27
27
  from ..ontology.expand_entities import expand_entities
28
28
 
29
29
  RELATIONSHIP_ENTITY_NAME = "os_relationship"
@@ -69,6 +69,7 @@ class NifiEntityModel(BaseModel):
69
69
  class OntologyInfoModel(BaseModel):
70
70
  parents: List[str]
71
71
  relationships: List[str]
72
+ label_keys: List[str]
72
73
 
73
74
  class ContentsPointerModel(BaseModel):
74
75
  location: NifiContentsPointerLocationModel
@@ -806,15 +807,8 @@ class NifiEntity(object):
806
807
 
807
808
  @property
808
809
  def label(self):
809
- if not self.context.ontology:
810
- return None
811
- label_fields = self.context.ontology["concepts"][
812
- self.record.get("os_concept") or self.record.get("entity_type")
813
- ]["labelKeys"]
814
- label_fields = [field for field in label_fields if field]
815
- label = " ".join(
816
- [(self.record.get(field) or "") for field in label_fields]
817
- ).strip()
810
+ label_keys = self.request["ontology_info"]["label_keys"]
811
+ label = " ".join([(self.record.get(field) or "") for field in label_keys]).strip()
818
812
  if not label:
819
813
  label = None
820
814
  return label
@@ -923,14 +917,21 @@ class NifiEntity(object):
923
917
  now_time = now()
924
918
  random_id = str(uuid.uuid4())
925
919
  username = self.jwt_data["username"]
920
+ if entity_type == self.record["entity_type"]:
921
+ ont_parents = self.request["ontology_info"]["parents"]
922
+ ont_relationships = self.request["ontology_info"]["relationships"]
923
+ ont_label_keys = self.request["ontology_info"]["label_keys"]
924
+ else:
925
+ ont_parents = self.context.ontology["concepts"][entity_type]["parents"]
926
+ ont_relationships = self.context.ontology["concepts"][entity_type]["relationships"]
927
+ ont_label_keys = get_label_keys(entity_type, self.context.ontology)
926
928
  child_request = {
927
929
  "jwt": self.request["jwt"],
928
930
  "ontology_name": self.request["ontology_name"],
929
931
  "ontology_info": {
930
- "parents": self.context.ontology["concepts"][entity_type]["parents"],
931
- "relationships": self.context.ontology["concepts"][entity_type][
932
- "relationships"
933
- ],
932
+ "parents": ont_parents,
933
+ "relationships": ont_relationships,
934
+ "label_keys": ont_label_keys,
934
935
  },
935
936
  "entity_timestamp": None,
936
937
  "sync_params": {},
@@ -1,20 +1,41 @@
1
1
  import re
2
+ from typing import Optional
3
+
2
4
  import py3langid as langid
3
- import iso639 as languages
5
+
6
+ from iso639 import Lang, NonExistentLanguageError
7
+
8
+ FLAIR_MODELS = {
9
+ "en": "flair/ner-english-large",
10
+ "es": "flair/ner-spanish-large",
11
+ "de": "flair/ner-german-large",
12
+ "nl": "flair/ner-dutch-large",
13
+ "multi": "flair/ner-multi", # English, German, French, Spanish
14
+ "multi-fast": "flair/ner-multi-fast", # English, German, Dutch, Spanish
15
+ }
16
+
17
+ SPACY_MODELS = {
18
+ "en": "en_core_web_sm",
19
+ "es": "es_core_news_sm",
20
+ "fr": "fr_core_news_sm",
21
+ "de": "de_core_news_sm",
22
+ "it": "it_core_news_sm"
23
+ }
4
24
 
5
25
 
6
26
  def alpha2_to_language(alpha2: str) -> str:
7
27
  if not alpha2:
8
- return None
9
- code = alpha2.strip().lower()
10
- return languages.to_name(code)
28
+ raise ValueError("Language code must be a non-empty string.")
29
+ return Lang(alpha2).name
30
+
11
31
 
12
32
  def language_to_alpha2(language_name: str) -> str:
13
33
  if not language_name:
14
- return None
15
- name = language_name.strip().lower()
16
- data = languages.find(name)
17
- return data["iso639_1"]
34
+ raise ValueError("Language name must be a non-empty string.")
35
+
36
+ name = re.sub(r'\b\w+', lambda m: m.group(0).capitalize(), language_name)
37
+ return Lang(name).pt1
38
+
18
39
 
19
40
  def detect_language(text, min_confidence=None):
20
41
  detector = langid.langid.LanguageIdentifier.from_pickled_model(
@@ -23,6 +44,43 @@ def detect_language(text, min_confidence=None):
23
44
  detected_lang, confidence = detector.classify(text)
24
45
  if min_confidence and confidence < min_confidence:
25
46
  return None, confidence
26
- detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
27
- detected_lang = languages.to_name(detected_lang).lower()
47
+ detected_lang = alpha2_to_language(detected_lang)
28
48
  return detected_lang, confidence
49
+
50
+
51
+ def is_language_available(language: Optional[str], type: str) -> bool:
52
+ if not language:
53
+ return False
54
+
55
+ try:
56
+ lang_code = language_to_alpha2(language)
57
+
58
+ except NonExistentLanguageError:
59
+ lang_code = language
60
+
61
+ match type:
62
+ case "spacy":
63
+ return SPACY_MODELS.get(lang_code, None) is not None
64
+
65
+ case "flair":
66
+ return FLAIR_MODELS.get(lang_code, None) is not None
67
+
68
+
69
+ def load_language_model(language, type):
70
+ from flair.models import SequenceTagger
71
+ from spacy_download import load_spacy
72
+
73
+ match type:
74
+ case "spacy":
75
+ if is_language_available(language, "spacy"):
76
+ model_name = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
77
+ return load_spacy(model_name)
78
+
79
+ raise Exception(f"SpaCy model for language '{language}' is not available.")
80
+
81
+ case "flair":
82
+ if is_language_available(language, "flair"):
83
+ model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
84
+ return SequenceTagger.load(model_name)
85
+
86
+ raise Exception(f"Flair model for language '{language}' is not available.")
@@ -20,7 +20,7 @@ from sumy.summarizers.lsa import LsaSummarizer
20
20
  from sumy.summarizers.luhn import LuhnSummarizer
21
21
  from sumy.utils import get_stop_words
22
22
 
23
- from .language import alpha2_to_language, language_to_alpha2
23
+ from .language import alpha2_to_language
24
24
 
25
25
  BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
26
26
  "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
@@ -67,38 +67,6 @@ BASE_TO_RECOGNIZER_EXPANSIONS = {
67
67
 
68
68
  BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
69
69
 
70
- FLAIR_MODELS = {
71
- "en": "flair/ner-english-large",
72
- "es": "flair/ner-spanish-large",
73
- "de": "flair/ner-german-large",
74
- "nl": "flair/ner-dutch-large",
75
- "multi": "flair/ner-multi", # English, German, French, Spanish
76
- "multi-fast": "flair/ner-multi-fast", # English, German, Dutch, Spanish
77
- }
78
-
79
- SPACY_MODELS = {
80
- "en": "en_core_web_sm",
81
- "es": "es_core_news_sm",
82
- "fr": "fr_core_news_sm",
83
- "de": "de_core_news_sm",
84
- "it": "it_core_news_sm"
85
- }
86
-
87
- def load_language_model(language, type):
88
- from flair.models import SequenceTagger
89
-
90
- model = None
91
-
92
- match type:
93
- case "spacy":
94
- model = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
95
-
96
- case "flair":
97
- model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
98
- model = SequenceTagger.load(model_name)
99
-
100
- return model
101
-
102
70
 
103
71
  class FlairRecognizer(EntityRecognizer):
104
72
  ENTITIES = [
@@ -430,7 +398,7 @@ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, lang
430
398
 
431
399
 
432
400
  def compute_ner_presidio(
433
- text,
401
+ text_or_df,
434
402
  language,
435
403
  analyzer,
436
404
  entities=None,
@@ -441,15 +409,15 @@ def compute_ner_presidio(
441
409
  batch_size=32,
442
410
  n_process=4
443
411
  ):
444
- if isinstance(text, pd.DataFrame):
445
- if len(text) >= 100:
446
- return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
412
+ if isinstance(text_or_df, pd.DataFrame):
413
+ if len(text_or_df) >= 100:
414
+ return analyze_dataframe_optimized(text_or_df, analyzer, language, entities, score_threshold)
447
415
 
448
416
  else:
449
417
  texts = []
450
418
 
451
- for col in text.columns:
452
- for idx, value in text[col].dropna().items():
419
+ for col in text_or_df.columns:
420
+ for idx, value in text_or_df[col].dropna().items():
453
421
  text_value = str(value).strip()
454
422
 
455
423
  if text_value:
@@ -457,7 +425,8 @@ def compute_ner_presidio(
457
425
 
458
426
  text = "\n".join(texts)
459
427
 
460
- elif isinstance(text, list):
428
+ elif isinstance(text_or_df, list):
429
+ text = text_or_df
461
430
  batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
462
431
 
463
432
  results_generator = batch_analyzer.analyze_iterator(
@@ -566,7 +535,7 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
566
535
 
567
536
 
568
537
  def ner_pipe(
569
- text,
538
+ text_or_df,
570
539
  language,
571
540
  model,
572
541
  engine_type="spacy",
@@ -589,9 +558,9 @@ def ner_pipe(
589
558
  model=model,
590
559
  )
591
560
 
592
- if isinstance(text, pd.DataFrame):
561
+ if isinstance(text_or_df, pd.DataFrame):
593
562
  ner = compute_ner_presidio(
594
- text,
563
+ text_or_df,
595
564
  language,
596
565
  analyzer,
597
566
  entities,
@@ -602,6 +571,8 @@ def ner_pipe(
602
571
  n_process=n_process
603
572
  )
604
573
  else:
574
+ text = text_or_df
575
+
605
576
  if compression_ratio == "auto":
606
577
  compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
607
578
 
@@ -640,8 +611,8 @@ def get_ner_handler(
640
611
  except LookupError:
641
612
  language = "en"
642
613
 
643
- return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
644
- text,
614
+ return lambda text_or_df, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
615
+ text_or_df,
645
616
  language,
646
617
  model,
647
618
  engine_type,
@@ -1,2 +1,15 @@
1
1
  def is_child_concept(type, parent_type, ontology):
2
2
  return type == parent_type or parent_type in ontology["concepts"][type]["parents"]
3
+
4
+ def get_label_keys(type, ontology):
5
+ parents = set(ontology["concepts"][type]["parents"])
6
+ parents.add(type)
7
+ parents = list(parents)
8
+ parents.reverse()
9
+ label_keys = {} # for guaranteed insertion order
10
+ for parent in parents:
11
+ for label_key in ontology["concepts"][parent]["labelKeys"]:
12
+ if not label_key:
13
+ continue
14
+ label_keys[label_key] = None
15
+ return list(label_keys.keys())
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 2.11a5
3
+ Version: 2.11a8
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -1,8 +1,8 @@
1
1
  streamlit_octostar_utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
2
2
  streamlit_octostar_utils/api_crafter/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
3
3
  streamlit_octostar_utils/api_crafter/celery.py,sha256=BXOTGN9egdD75qf-PkccLGAoniilB9PZ_NRchFIjWdw,30051
4
- streamlit_octostar_utils/api_crafter/fastapi.py,sha256=2bktT5Mwjs9XixWcOqUKMoLM_cgKl-cqZDUa2Imf4xA,14357
5
- streamlit_octostar_utils/api_crafter/nifi.py,sha256=yFs1HXpSVfWpOC1aJnNahjPofGzZ8fpuqvChloqM4rQ,45541
4
+ streamlit_octostar_utils/api_crafter/fastapi.py,sha256=RKQrStPzG1I1pxsPJvGs_DRrnjlMJbVmu9ObMF2LgZ0,14368
5
+ streamlit_octostar_utils/api_crafter/nifi.py,sha256=6PSWIFKjv8nzlFGH9IFRI3VrYsISNjDIPyi1RvLJoKk,45810
6
6
  streamlit_octostar_utils/api_crafter/parser/__init__.py,sha256=YeYWF6sdQiCFV_RKNW2t9Vs6KJExE2pbXxWTe_DOayY,107
7
7
  streamlit_octostar_utils/api_crafter/parser/combine_fields.py,sha256=ddc44xkajw8MU0peAX_263DL7rPXbTKbHUjpOhRgvyU,8790
8
8
  streamlit_octostar_utils/api_crafter/parser/entities_parser.py,sha256=zOQoN-p1Gz6ZzxvoX4M1b4Fi3mfmQr5zaNUcp_8gCjw,30016
@@ -20,15 +20,15 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
20
20
  streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
21
21
  streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
22
22
  streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
23
- streamlit_octostar_utils/nlp/language.py,sha256=l48rBoLLBpTZz40N2KWNSpAWc8smcWMtiiDXREhmLtE,926
24
- streamlit_octostar_utils/nlp/ner.py,sha256=LwnGbQHoT2mitroc0WjM2lVjtSUW7OUhqNmLsLMpNYQ,21196
23
+ streamlit_octostar_utils/nlp/language.py,sha256=WEBhjr2UYgBGQnki0cY7d9kjp5RX5cYewUh57H6Om6o,2718
24
+ streamlit_octostar_utils/nlp/ner.py,sha256=5swAuH7r9xZ7c48ApqZfLqidjdf6f2qxK52KLk7-9Cc,20406
25
25
  streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
26
26
  streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
27
27
  streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
28
28
  streamlit_octostar_utils/octostar/permissions.py,sha256=G5nZQLR-k-5_Xeto4nDTb32828Ga-SHm1mvSB9tz-t4,1565
29
29
  streamlit_octostar_utils/ontology/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
30
30
  streamlit_octostar_utils/ontology/expand_entities.py,sha256=bBt32Dnts3VSzu13QQtPyfYe05IRodD9WfnhNTiBg_w,22749
31
- streamlit_octostar_utils/ontology/inheritance.py,sha256=oSd6xDAlmI7iYOv3VJ7t8CRN2zK7_Cln26YHS20qAqw,138
31
+ streamlit_octostar_utils/ontology/inheritance.py,sha256=8GA2an1hbHfa6p993tIyfFLrewJHRUIFOw7dmvL8geU,583
32
32
  streamlit_octostar_utils/ontology/validation.py,sha256=0cXxEq8vQ63qxn4WianTioTcsmpsg4jEXVyI4R6x1gE,1051
33
33
  streamlit_octostar_utils/style/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
34
34
  streamlit_octostar_utils/style/common.py,sha256=TKfjV9-sIoJChGM7Ewg3uPsz5sMmPxFwmc0o3L4D8Qo,1496
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
36
36
  streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
37
37
  streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
38
38
  streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
39
- streamlit_octostar_utils-2.11a5.dist-info/METADATA,sha256=sa3ksvvDUHpMWd_szqcaFI_x9u7dVwc9Ctj1gcAyujg,2330
40
- streamlit_octostar_utils-2.11a5.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
41
- streamlit_octostar_utils-2.11a5.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
42
- streamlit_octostar_utils-2.11a5.dist-info/RECORD,,
39
+ streamlit_octostar_utils-2.11a8.dist-info/METADATA,sha256=uoeaIC6YWiZgZAa36JP45z-AyS-s-PP2uIpfkKjtm1k,2330
40
+ streamlit_octostar_utils-2.11a8.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
41
+ streamlit_octostar_utils-2.11a8.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
42
+ streamlit_octostar_utils-2.11a8.dist-info/RECORD,,