streamlit-octostar-utils 2.11a1__tar.gz → 2.11a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/pyproject.toml +1 -1
  3. streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/language.py +55 -0
  4. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/nlp/ner.py +81 -31
  5. streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/language.py +0 -15
  6. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/LICENSE +0 -0
  7. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/README.md +0 -0
  8. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/__init__.py +0 -0
  9. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  10. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  11. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  13. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  31. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/client.py +0 -0
  32. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/context.py +0 -0
  33. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  34. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  35. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  36. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  37. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/ontology/validation.py +0 -0
  38. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/style/__init__.py +0 -0
  39. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/style/common.py +0 -0
  40. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/__init__.py +0 -0
  41. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  42. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  43. {streamlit_octostar_utils-2.11a1 → streamlit_octostar_utils-2.11a3}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
--- streamlit_octostar_utils-2.11a1/PKG-INFO
+++ streamlit_octostar_utils-2.11a3/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version: 2.11a1
+Version: 2.11a3
 Summary:
 License: MIT
 License-File: LICENSE
--- streamlit_octostar_utils-2.11a1/pyproject.toml
+++ streamlit_octostar_utils-2.11a3/pyproject.toml
@@ -5,7 +5,7 @@ include = '\.pyi?$'
 
 [tool.poetry]
 name = "streamlit-octostar-utils"
-version = "2.11a1"
+version = "2.11a3"
 description = ""
 license = "MIT"
 authors = ["Octostar"]
--- /dev/null
+++ streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/language.py
@@ -0,0 +1,55 @@
+import re
+import py3langid as langid
+import iso639 as languages
+
+
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
+def detect_language(text, min_confidence=None):
+    detector = langid.langid.LanguageIdentifier.from_pickled_model(
+        langid.langid.MODEL_FILE, norm_probs=True
+    )
+    detected_lang, confidence = detector.classify(text)
+    if min_confidence and confidence < min_confidence:
+        return None, confidence
+    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
+    detected_lang = languages.to_name(detected_lang).lower()
+    return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
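The new language.py module centralizes language handling: ISO 639 code/name conversion via iso639, detection via py3langid, and per-language NER model selection. A minimal usage sketch, assuming the installed iso639 package resolves the codes shown (the sample string is illustrative):

    from streamlit_octostar_utils.nlp.language import (
        alpha2_to_language,
        detect_language,
        load_language_model,
    )

    print(alpha2_to_language("it"))  # expected: "Italian"

    # Returns (language_name, confidence), or (None, confidence) when the
    # normalized probability falls below min_confidence.
    lang, conf = detect_language("Questo è un testo di esempio.", min_confidence=0.5)

    # Languages without a dedicated entry fall back to en_core_web_sm (spaCy)
    # or flair/ner-multi (Flair).
    tagger = load_language_model("it", "flair")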
--- streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/ner.py
+++ streamlit_octostar_utils-2.11a3/streamlit_octostar_utils/nlp/ner.py
@@ -20,20 +20,67 @@ from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 
+from .language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
+}
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
+}
+
+BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
+
 
 class FlairRecognizer(EntityRecognizer):
     ENTITIES = [
-        "LOCATION",
+        "LOC",
         "PERSON",
-        "ORGANIZATION",
+        "ORG",
     ]
 
     DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
 
     CHECK_LABEL_GROUPS = [
-        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"LOC"}, {"LOC", "LOCATION"}),
         ({"PERSON"}, {"PER", "PERSON"}),
-        ({"ORGANIZATION"}, {"ORG"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
     ]
 
     MODEL_LANGUAGES = {
@@ -47,8 +94,8 @@ class FlairRecognizer(EntityRecognizer):
 
     PRESIDIO_EQUIVALENCES = {
         "PER": "PERSON",
-        "LOC": "LOCATION",
-        "ORG": "ORGANIZATION",
+        "LOC": "LOC",
+        "ORG": "ORG"
     }
 
     def __init__(
@@ -138,8 +185,17 @@ class FlairRecognizer(EntityRecognizer):
         )
 
 
-BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
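Together with the tables introduced at the top of ner.py, these two helpers make the base label set the single vocabulary at the API boundary: expand_entities_for_analyzer widens a base-label filter into every recognizer-specific alias before analysis, and normalize_label folds recognizer output back onto a base label. A small round-trip sketch using only names defined in this diff:

    requested = ["PHONE", "LOC"]
    analyzer_entities = expand_entities_for_analyzer(requested)
    # e.g. ["PHONE", "PHONE_NUMBER", "LOC", "LOCATION"] (set order varies)

    print(normalize_label("PHONE_NUMBER"))  # "PHONE"
    print(normalize_label("PERSON"))        # unmapped labels pass through unchanged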
@@ -175,17 +231,13 @@ def _sumy__luhn_call(summarizer, document):
 
 
 def get_nltk_tokenizer(language: str) -> Tokenizer:
-    if language == "en":
-        nltk_lang = "english"
-    elif language == "it":
-        nltk_lang = "italian"
-    else:
-        nltk_lang = language
+    nltk_lang = alpha2_to_language(language).lower()
 
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt")
+
     return Tokenizer(nltk_lang)
 
 
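get_nltk_tokenizer now derives the NLTK language name from the ISO alpha-2 code instead of hard-coding the en/it cases, so any code that iso639 can resolve works; codes it cannot resolve will presumably raise now rather than fall through verbatim as before. For example:

    tokenizer = get_nltk_tokenizer("de")  # alpha2_to_language("de") -> "German" -> "german"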
@@ -251,7 +303,7 @@ def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=Non
     default_registry = RecognizerRegistry()
     default_registry.load_predefined_recognizers()
 
-    flair_handled_entities = {"PERSON", "LOCATION", "ORGANIZATION"}
+    flair_handled_entities = {"PERSON", "LOC", "ORG"}
 
     for recognizer in default_registry.recognizers:
         recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
@@ -305,12 +357,12 @@ def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, la
         results = analyzer.analyze(
             text=text,
             language=language,
-            entities=entities if entities else None
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
         )
 
         for result in results:
             if result.score >= score_threshold:
-                entity_counter[result.entity_type] += 1
+                entity_counter[normalize_label(result.entity_type)] += 1
 
     if not entity_counter:
         return None
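With both helpers in place, analyze_column_sample tallies only base labels: requested entities are expanded before the analyzer call and every result is normalized before counting, so aliases such as PHONE_NUMBER and PHONE accumulate under one key. A sketch of the counting step with mock analyzer output (the entity types are illustrative):

    from collections import Counter

    entity_counter = Counter()
    for entity_type in ["PHONE_NUMBER", "PHONE", "EMAIL_ADDRESS"]:  # mock results
        entity_counter[normalize_label(entity_type)] += 1

    print(entity_counter)  # Counter({'PHONE': 2, 'EMAIL': 1})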
@@ -390,7 +442,7 @@ def compute_ner_presidio(
         language=language,
         batch_size=batch_size,
         n_process=n_process,
-        entities=entities if entities else None,
+        entities=(expand_entities_for_analyzer(entities) if entities else None),
     )
 
     all_results = list(results_generator)
@@ -405,7 +457,7 @@ def compute_ner_presidio(
 
             ner_objects.append(NERObject(
                 name=text_item[result.start:result.end],
-                label=result.entity_type,
+                label=normalize_label(result.entity_type),
                 score=float(result.score),
                 start=int(result.start),
                 count=1,
@@ -417,7 +469,7 @@ def compute_ner_presidio(
     results = analyzer.analyze(
         text=text,
         language=language,
-        entities=entities if entities else None
+        entities=(expand_entities_for_analyzer(entities) if entities else None)
     )
 
     ner_objects = []
@@ -430,7 +482,7 @@ def compute_ner_presidio(
 
         ner_objects.append(NERObject(
             name=text[result.start:result.end],
-            label=result.entity_type,
+            label=normalize_label(result.entity_type),
             score=float(result.score),
             start=int(result.start),
             count=1,
@@ -517,11 +569,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-            150,
-            with_comentions,
-            with_context,
-            batch_size,
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
     else:
         if compression_ratio == "auto":
@@ -538,11 +589,10 @@ def ner_pipe(
             analyzer,
             entities,
             score_threshold,
-            150,
-            with_comentions,
-            with_context,
-            batch_size,
-            n_process
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
         )
 
     return ner
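The ner_pipe fix replaces a positional argument list, in which a literal 150 was passed after score_threshold and every following value shifted into the wrong parameter, with explicit keywords, so each value now reaches its intended parameter regardless of signature order. A toy illustration of the failure mode (hypothetical signature, not the real one):

    def analyze(threshold, window=150, with_context=False):
        ...

    analyze(0.5, True)               # positional: True silently lands in window
    analyze(0.5, with_context=True)  # keyword: immune to parameter order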
--- streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/language.py
+++ /dev/null
@@ -1,15 +0,0 @@
-import re
-import py3langid as langid
-import iso639 as languages
-
-
-def detect_language(text, min_confidence=None):
-    detector = langid.langid.LanguageIdentifier.from_pickled_model(
-        langid.langid.MODEL_FILE, norm_probs=True
-    )
-    detected_lang, confidence = detector.classify(text)
-    if min_confidence and confidence < min_confidence:
-        return None, confidence
-    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
-    detected_lang = languages.to_name(detected_lang).lower()
-    return detected_lang, confidence