streamlit-octostar-utils 2.11a4__py3-none-any.whl → 2.11a6__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
@@ -1,20 +1,39 @@
  import re
  import py3langid as langid
- import iso639 as languages
+
+ from iso639 import Lang
+
+ FLAIR_MODELS = {
+     "en": "flair/ner-english-large",
+     "es": "flair/ner-spanish-large",
+     "de": "flair/ner-german-large",
+     "nl": "flair/ner-dutch-large",
+     "multi": "flair/ner-multi",  # English, German, French, Spanish
+     "multi-fast": "flair/ner-multi-fast",  # English, German, Dutch, Spanish
+ }
+
+ SPACY_MODELS = {
+     "en": "en_core_web_sm",
+     "es": "es_core_news_sm",
+     "fr": "fr_core_news_sm",
+     "de": "de_core_news_sm",
+     "it": "it_core_news_sm"
+ }


  def alpha2_to_language(alpha2: str) -> str:
      if not alpha2:
-         return None
-     code = alpha2.strip().lower()
-     return languages.to_name(code)
+         raise ValueError("Language code must be a non-empty string.")
+     return Lang(alpha2).name
+

  def language_to_alpha2(language_name: str) -> str:
      if not language_name:
-         return None
-     name = language_name.strip().lower()
-     data = languages.find(name)
-     return data["iso639_1"]
+         raise ValueError("Language name must be a non-empty string.")
+
+     name = re.sub(r'\b\w+', lambda m: m.group(0).capitalize(), language_name.strip())
+     return Lang(name).pt1
+

  def detect_language(text, min_confidence=None):
      detector = langid.langid.LanguageIdentifier.from_pickled_model(
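The rewritten helpers above use the Lang class from the iso639-lang package (imported as `from iso639 import Lang`) instead of the older iso639 to_name/find API, and they raise ValueError on empty input rather than returning None. A minimal sketch of that mapping, assuming iso639-lang is installed; note that Lang accepts lowercase part-1 codes but expects capitalized language names, which is why language_to_alpha2 title-cases its input:

    from iso639 import Lang

    # part-1 code -> English name, as in alpha2_to_language
    print(Lang("en").name)     # English
    # capitalized name -> part-1 code, as in language_to_alpha2
    print(Lang("French").pt1)  # fr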
@@ -23,33 +42,23 @@ def detect_language(text, min_confidence=None):
      detected_lang, confidence = detector.classify(text)
      if min_confidence and confidence < min_confidence:
          return None, confidence
-     detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
-     detected_lang = languages.to_name(detected_lang).lower()
+     detected_lang = alpha2_to_language(detected_lang)
      return detected_lang, confidence

- FLAIR_MODELS = {
-     "en": "flair/ner-english-large",
-     "es": "flair/ner-spanish-large",
-     "de": "flair/ner-german-large",
-     "nl": "flair/ner-dutch-large",
-     "multi": "flair/ner-multi",
-     "multi-fast": "flair/ner-multi-fast",
- }
-
- SPACY_MODELS = {
-     "en": 'en_core_web_sm',
- }

  def load_language_model(language, type):
      from flair.models import SequenceTagger
      from spacy_download import load_spacy

      model = None
+
      match type:
          case "spacy":
-             model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+             model_name = SPACY_MODELS.get(language_to_alpha2(language), SPACY_MODELS["en"])
              model = load_spacy(model_name)
+
          case "flair":
              model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
              model = SequenceTagger.load(model_name)
-     return model
+
+     return model
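load_language_model above now resolves spaCy models by first converting the language name to an alpha-2 code, falling back to en_core_web_sm when the language is not covered, while Flair lookups are keyed by the raw language argument and fall back to flair/ner-multi. A rough, self-contained sketch of just that selection logic (pick_model is a hypothetical helper; it mirrors the dictionary lookups without downloading any model weights, and assumes iso639-lang is installed):

    from iso639 import Lang

    SPACY_MODELS = {"en": "en_core_web_sm", "es": "es_core_news_sm", "fr": "fr_core_news_sm",
                    "de": "de_core_news_sm", "it": "it_core_news_sm"}
    FLAIR_MODELS = {"en": "flair/ner-english-large", "es": "flair/ner-spanish-large",
                    "de": "flair/ner-german-large", "nl": "flair/ner-dutch-large"}

    def pick_model(language: str, engine: str) -> str:
        if engine == "spacy":
            # same idea as language_to_alpha2: capitalize the name, take the part-1 code
            code = Lang(language.strip().title()).pt1
            return SPACY_MODELS.get(code, SPACY_MODELS["en"])
        # Flair falls back to the multilingual model for uncovered languages
        return FLAIR_MODELS.get(language, "flair/ner-multi")

    print(pick_model("Spanish", "spacy"))  # es_core_news_sm
    print(pick_model("sw", "flair"))       # flair/ner-multi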
@@ -83,15 +83,6 @@ class FlairRecognizer(EntityRecognizer):
          ({"ORG"}, {"ORG", "ORGANIZATION"}),
      ]

-     MODEL_LANGUAGES = {
-         "en": "flair/ner-english-large",
-         "es": "flair/ner-spanish-large",
-         "de": "flair/ner-german-large",
-         "nl": "flair/ner-dutch-large",
-         "multi": "flair/ner-multi",
-         "multi-fast": "flair/ner-multi-fast",
-     }
-
      PRESIDIO_EQUIVALENCES = {
          "PER": "PERSON",
          "LOC": "LOC",
@@ -407,7 +398,7 @@ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, lang


  def compute_ner_presidio(
-     text,
+     text_or_df,
      language,
      analyzer,
      entities=None,
@@ -418,15 +409,15 @@ def compute_ner_presidio(
      batch_size=32,
      n_process=4
  ):
-     if isinstance(text, pd.DataFrame):
-         if len(text) >= 100:
-             return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+     if isinstance(text_or_df, pd.DataFrame):
+         if len(text_or_df) >= 100:
+             return analyze_dataframe_optimized(text_or_df, analyzer, language, entities, score_threshold)

          else:
              texts = []

-             for col in text.columns:
-                 for idx, value in text[col].dropna().items():
+             for col in text_or_df.columns:
+                 for idx, value in text_or_df[col].dropna().items():
                      text_value = str(value).strip()

                      if text_value:
@@ -434,7 +425,8 @@ def compute_ner_presidio(

              text = "\n".join(texts)

-     elif isinstance(text, list):
+     elif isinstance(text_or_df, list):
+         text = text_or_df
          batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)

          results_generator = batch_analyzer.analyze_iterator(
@@ -543,12 +535,13 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa


  def ner_pipe(
-     text,
+     text_or_df,
      language,
      model,
      engine_type="spacy",
      fast=False,
      compression_ratio="auto",
+     with_scores=False,
      with_comentions=True,
      with_context=True,
      entities=None,
@@ -556,15 +549,18 @@ def ner_pipe(
      batch_size=32,
      n_process=4
  ):
+     if with_scores:
+         raise NotImplementedError("with_scores functionality is not implemented yet")
+
      analyzer = build_presidio_analyzer(
          language=language,
          engine_type=engine_type,
          model=model,
      )

-     if isinstance(text, pd.DataFrame):
+     if isinstance(text_or_df, pd.DataFrame):
          ner = compute_ner_presidio(
-             text,
+             text_or_df,
              language,
              analyzer,
              entities,
@@ -575,6 +571,8 @@ def ner_pipe(
              n_process=n_process
          )
      else:
+         text = text_or_df
+
          if compression_ratio == "auto":
              compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0

@@ -613,13 +611,14 @@ def get_ner_handler(
      except LookupError:
          language = "en"

-     return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
-         text,
+     return lambda text_or_df, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
+         text_or_df,
          language,
          model,
          engine_type,
          fast,
          compression_ratio,
+         with_scores,
          with_comentions,
          with_context,
          entities,
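get_ner_handler above now returns a handler that takes the renamed text_or_df argument plus a with_scores flag, and ner_pipe currently rejects with_scores=True with a NotImplementedError. A stripped-down sketch of that contract using stand-in names (ner_pipe_stub is hypothetical, not the package's real call chain):

    def ner_pipe_stub(text_or_df, with_scores=False, **kwargs):
        # mirrors the guard added at the top of ner_pipe
        if with_scores:
            raise NotImplementedError("with_scores functionality is not implemented yet")
        return []  # placeholder for the real NER results

    # same keyword surface as the lambda returned by get_ner_handler
    def handler(text_or_df, compression_ratio="auto", with_scores=False,
                with_comentions=True, with_context=True):
        return ner_pipe_stub(text_or_df, with_scores=with_scores)

    handler("Ada Lovelace met Charles Babbage in London.")  # runs as before
    # handler("...", with_scores=True)  # would raise NotImplementedError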
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: streamlit-octostar-utils
- Version: 2.11a4
+ Version: 2.11a6
  Summary:
  License: MIT
  License-File: LICENSE
@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
  streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
  streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
  streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
- streamlit_octostar_utils/nlp/language.py,sha256=2d8Wq8wTuo_ehjZekuoe3bgJD52ieEiZKDUPdKdOxZ0,1699
- streamlit_octostar_utils/nlp/ner.py,sha256=BP32wkZUNaKVIyzREEAgluPfwiISmNE4uITg7g1p0PM,20381
+ streamlit_octostar_utils/nlp/language.py,sha256=zmzGVd_RcJ3O5DHLOTjntZgnxQ5vKhxWu24_ihC1y8w,1929
+ streamlit_octostar_utils/nlp/ner.py,sha256=5swAuH7r9xZ7c48ApqZfLqidjdf6f2qxK52KLk7-9Cc,20406
  streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
  streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
  streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
  streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
  streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
- streamlit_octostar_utils-2.11a4.dist-info/METADATA,sha256=kRXpmh9YsBrKHNIPYW0gzzASNEwJr8Yj5SJb6gnr4WU,2330
- streamlit_octostar_utils-2.11a4.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
- streamlit_octostar_utils-2.11a4.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
- streamlit_octostar_utils-2.11a4.dist-info/RECORD,,
+ streamlit_octostar_utils-2.11a6.dist-info/METADATA,sha256=7FI-njG_MgeGy-YcXWQ_40COdjEHLnE3u3oSLRLIpNI,2330
+ streamlit_octostar_utils-2.11a6.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ streamlit_octostar_utils-2.11a6.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+ streamlit_octostar_utils-2.11a6.dist-info/RECORD,,