streamlit-octostar-utils 2.11a5__tar.gz → 2.11a6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/pyproject.toml +1 -1
  3. streamlit_octostar_utils-2.11a6/streamlit_octostar_utils/nlp/language.py +64 -0
  4. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/ner.py +16 -45
  5. streamlit_octostar_utils-2.11a5/streamlit_octostar_utils/nlp/language.py +0 -28
  6. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/LICENSE +0 -0
  7. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/README.md +0 -0
  8. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/__init__.py +0 -0
  9. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  10. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  11. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  13. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  31. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/client.py +0 -0
  32. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/context.py +0 -0
  33. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  34. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  35. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  36. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  37. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/ontology/validation.py +0 -0
  38. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/style/__init__.py +0 -0
  39. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/style/common.py +0 -0
  40. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/__init__.py +0 -0
  41. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  42. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  43. {streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
{streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version: 2.11a5
+Version: 2.11a6
 Summary:
 License: MIT
 License-File: LICENSE

{streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/pyproject.toml
@@ -5,7 +5,7 @@ include = '\.pyi?$'
 
 [tool.poetry]
 name = "streamlit-octostar-utils"
-version = "2.11a5"
+version = "2.11a6"
 description = ""
 license = "MIT"
 authors = ["Octostar"]
streamlit_octostar_utils-2.11a6/streamlit_octostar_utils/nlp/language.py (new file)
@@ -0,0 +1,64 @@
+import re
+import py3langid as langid
+
+from iso639 import Lang
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",  # English, German, French, Spanish
+    "multi-fast": "flair/ner-multi-fast",  # English, German, Dutch, Spanish
+}
+
+SPACY_MODELS = {
+    "en": "en_core_web_sm",
+    "es": "es_core_news_sm",
+    "fr": "fr_core_news_sm",
+    "de": "de_core_news_sm",
+    "it": "it_core_news_sm"
+}
+
+
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        raise ValueError("Language code must be a non-empty string.")
+    return Lang(alpha2).name
+
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        raise ValueError("Language name must be a non-empty string.")
+
+    name = re.sub(r'\b\w+', lambda m: m.group(0).capitalize(), language_name)
+    return Lang(name).pt1
+
+
+def detect_language(text, min_confidence=None):
+    detector = langid.langid.LanguageIdentifier.from_pickled_model(
+        langid.langid.MODEL_FILE, norm_probs=True
+    )
+    detected_lang, confidence = detector.classify(text)
+    if min_confidence and confidence < min_confidence:
+        return None, confidence
+    detected_lang = alpha2_to_language(detected_lang)
+    return detected_lang, confidence
+
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+
+    return model
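A quick usage sketch of the rewritten helpers (not part of the diff; the expected outputs are inferred from the code above, assuming the py3langid and python-iso639 dependencies it imports are installed):

from streamlit_octostar_utils.nlp.language import (
    alpha2_to_language,
    detect_language,
    language_to_alpha2,
)

print(alpha2_to_language("en"))       # "English", via Lang("en").name
print(language_to_alpha2("english"))  # "en": each word capitalized, then Lang(...).pt1

# Returns a full language name plus a normalized probability,
# or (None, probability) when the threshold is not met.
lang, confidence = detect_language("Bonjour tout le monde", min_confidence=0.5)
print(lang, confidence)               # e.g. French 0.98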
{streamlit_octostar_utils-2.11a5 → streamlit_octostar_utils-2.11a6}/streamlit_octostar_utils/nlp/ner.py
@@ -20,7 +20,7 @@ from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 
-from .language import alpha2_to_language, language_to_alpha2
+from .language import alpha2_to_language
 
 BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
                        "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
@@ -67,38 +67,6 @@ BASE_TO_RECOGNIZER_EXPANSIONS = {
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
 
-FLAIR_MODELS = {
-    "en": "flair/ner-english-large",
-    "es": "flair/ner-spanish-large",
-    "de": "flair/ner-german-large",
-    "nl": "flair/ner-dutch-large",
-    "multi": "flair/ner-multi",  # English, German, French, Spanish
-    "multi-fast": "flair/ner-multi-fast",  # English, German, Dutch, Spanish
-}
-
-SPACY_MODELS = {
-    "en": "en_core_web_sm",
-    "es": "es_core_news_sm",
-    "fr": "fr_core_news_sm",
-    "de": "de_core_news_sm",
-    "it": "it_core_news_sm"
-}
-
-def load_language_model(language, type):
-    from flair.models import SequenceTagger
-
-    model = None
-
-    match type:
-        case "spacy":
-            model = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
-
-        case "flair":
-            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
-            model = SequenceTagger.load(model_name)
-
-    return model
-
 
 class FlairRecognizer(EntityRecognizer):
     ENTITIES = [
@@ -430,7 +398,7 @@ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, lang
 
 
 def compute_ner_presidio(
-    text,
+    text_or_df,
     language,
     analyzer,
     entities=None,
@@ -441,15 +409,15 @@
     batch_size=32,
     n_process=4
 ):
-    if isinstance(text, pd.DataFrame):
-        if len(text) >= 100:
-            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+    if isinstance(text_or_df, pd.DataFrame):
+        if len(text_or_df) >= 100:
+            return analyze_dataframe_optimized(text_or_df, analyzer, language, entities, score_threshold)
 
         else:
             texts = []
 
-            for col in text.columns:
-                for idx, value in text[col].dropna().items():
+            for col in text_or_df.columns:
+                for idx, value in text_or_df[col].dropna().items():
                     text_value = str(value).strip()
 
                     if text_value:
@@ -457,7 +425,8 @@
 
             text = "\n".join(texts)
 
-    elif isinstance(text, list):
+    elif isinstance(text_or_df, list):
+        text = text_or_df
         batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
         results_generator = batch_analyzer.analyze_iterator(
 
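The text → text_or_df rename reflects the three input shapes compute_ner_presidio now accepts. A standalone distillation of that dispatch, not the package's code; the helper name and return values below are illustrative only:

import pandas as pd

def dispatch(text_or_df):
    # Hypothetical sketch of compute_ner_presidio's input routing.
    if isinstance(text_or_df, pd.DataFrame):
        if len(text_or_df) >= 100:
            return "analyze_dataframe_optimized"  # large frames take the batched path
        # Small frames are flattened into one newline-joined string.
        texts = [
            str(value).strip()
            for col in text_or_df.columns
            for value in text_or_df[col].dropna()
            if str(value).strip()
        ]
        return "\n".join(texts)
    elif isinstance(text_or_df, list):
        return text_or_df  # lists are fed to BatchAnalyzerEngine.analyze_iterator
    return text_or_df  # plain strings pass through unchanged

print(dispatch(pd.DataFrame({"name": ["Alice", None, "  "]})))  # prints "Alice"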
@@ -566,7 +535,7 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
 
 
 def ner_pipe(
-    text,
+    text_or_df,
     language,
     model,
     engine_type="spacy",
@@ -589,9 +558,9 @@
         model=model,
     )
 
-    if isinstance(text, pd.DataFrame):
+    if isinstance(text_or_df, pd.DataFrame):
         ner = compute_ner_presidio(
-            text,
+            text_or_df,
             language,
             analyzer,
             entities,
@@ -602,6 +571,8 @@
             n_process=n_process
         )
     else:
+        text = text_or_df
+
         if compression_ratio == "auto":
             compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
 
@@ -640,8 +611,8 @@ def get_ner_handler(
     except LookupError:
         language = "en"
 
-    return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
-        text,
+    return lambda text_or_df, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
+        text_or_df,
         language,
         model,
         engine_type,
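The lambda returned by get_ner_handler keeps its keyword parameters and only renames the positional one, so existing callers break only if they passed text= by keyword. A runnable stand-in with the same signature (hypothetical; the real handler forwards everything to ner_pipe):

import pandas as pd

def handler(text_or_df, compression_ratio="auto", with_scores=False,
            with_comentions=True, with_context=True):
    # Stand-in mirroring the returned lambda's signature.
    return {"input_type": type(text_or_df).__name__, "ratio": compression_ratio}

print(handler("Alice met Bob in Paris."))                     # str path
print(handler(["Alice met Bob.", "Contact: a@example.com"]))  # list path
print(handler(pd.DataFrame({"notes": ["Alice met Bob."]})))   # DataFrame path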
streamlit_octostar_utils-2.11a5/streamlit_octostar_utils/nlp/language.py (removed)
@@ -1,28 +0,0 @@
-import re
-import py3langid as langid
-import iso639 as languages
-
-
-def alpha2_to_language(alpha2: str) -> str:
-    if not alpha2:
-        return None
-    code = alpha2.strip().lower()
-    return languages.to_name(code)
-
-def language_to_alpha2(language_name: str) -> str:
-    if not language_name:
-        return None
-    name = language_name.strip().lower()
-    data = languages.find(name)
-    return data["iso639_1"]
-
-def detect_language(text, min_confidence=None):
-    detector = langid.langid.LanguageIdentifier.from_pickled_model(
-        langid.langid.MODEL_FILE, norm_probs=True
-    )
-    detected_lang, confidence = detector.classify(text)
-    if min_confidence and confidence < min_confidence:
-        return None, confidence
-    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
-    detected_lang = languages.to_name(detected_lang).lower()
-    return detected_lang, confidence
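The removed module (above) used module-level to_name/find helpers, apparently from the iso-639 package; the rewrite switches to python-iso639's Lang class. Two behavioral changes worth noting: empty input now raises ValueError where 2.11a5 returned None, and detect_language now returns the capitalized reference name (e.g. "French") where 2.11a5 lower-cased it. A minimal check of the first change, assuming 2.11a6 is installed:

from streamlit_octostar_utils.nlp.language import alpha2_to_language

try:
    alpha2_to_language("")
except ValueError as exc:
    print(exc)  # 2.11a6 raises here; 2.11a5 returned None instead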