streamlit-octostar-utils 0.2.10__py3-none-any.whl → 2.11a2__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to the supported public registries. It is provided for informational purposes only.
@@ -330,7 +330,7 @@ class DefaultErrorRoute:
  if len(message) > MAX_ERROR_MESSAGE_BYTES:
      message = message[-MAX_ERROR_MESSAGE_BYTES:]
  try:
-     tcbk = "\n".join(traceback.format_exception(exc))
+     tcbk = traceback.format_exception(exc)
      if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
          tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
  except:
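
For reference, traceback.format_exception(exc) in its one-argument form (Python 3.10+) returns a list of strings, one per formatted line, rather than a single string, so the length check and slice above now operate on list items instead of characters. A minimal standalone sketch of capping a formatted traceback by character count; the helper name capped_traceback is illustrative and not part of the package:

    import traceback

    def capped_traceback(exc: BaseException, max_chars: int = 2000) -> str:
        lines = traceback.format_exception(exc)  # list of newline-terminated lines
        text = "".join(lines)                    # single string
        return text[-max_chars:]                 # keep only the trailing characters

    try:
        1 / 0
    except ZeroDivisionError as exc:
        print(capped_traceback(exc))
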
@@ -3,6 +3,19 @@ import py3langid as langid
  import iso639 as languages


+ def alpha2_to_language(alpha2: str) -> str:
+     if not alpha2:
+         return None
+     code = alpha2.strip().lower()
+     return languages.to_name(code)
+
+ def language_to_alpha2(language_name: str) -> str:
+     if not language_name:
+         return None
+     name = language_name.strip().lower()
+     data = languages.find(name)
+     return data["iso639_1"]
+
  def detect_language(text, min_confidence=None):
      detector = langid.langid.LanguageIdentifier.from_pickled_model(
          langid.langid.MODEL_FILE, norm_probs=True
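
The two helpers added to nlp/language.py convert between ISO 639-1 codes and full language names via the iso639 package. A minimal usage sketch, assuming the module path listed in the wheel's RECORD (streamlit_octostar_utils.nlp.language):

    from streamlit_octostar_utils.nlp.language import alpha2_to_language, language_to_alpha2

    print(alpha2_to_language("en"))       # expected to print "English"
    print(language_to_alpha2("Spanish"))  # expected to print "es"
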
@@ -13,3 +26,30 @@ def detect_language(text, min_confidence=None):
      detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
      detected_lang = languages.to_name(detected_lang).lower()
      return detected_lang, confidence
+
+ FLAIR_MODELS = {
+     "en": "flair/ner-english-large",
+     "es": "flair/ner-spanish-large",
+     "de": "flair/ner-german-large",
+     "nl": "flair/ner-dutch-large",
+     "multi": "flair/ner-multi",
+     "multi-fast": "flair/ner-multi-fast",
+ }
+
+ SPACY_MODELS = {
+     "en": 'en_core_web_sm',
+ }
+
+ def load_language_model(language, type):
+     from flair.models import SequenceTagger
+     from spacy_download import load_spacy
+
+     model = None
+     match type:
+         case "spacy":
+             model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+             model = load_spacy(model_name)
+         case "flair":
+             model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+             model = SequenceTagger.load(model_name)
+     return model
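
load_language_model resolves a language code to a concrete NER backend: a spaCy pipeline fetched via spacy_download, or a Flair SequenceTagger, falling back to en_core_web_sm and flair/ner-multi respectively for unmapped languages. A usage sketch under the same module-path assumption; both calls download model weights on first use:

    from streamlit_octostar_utils.nlp.language import load_language_model

    spacy_nlp = load_language_model("en", "spacy")     # resolves to en_core_web_sm
    flair_tagger = load_language_model("de", "flair")  # resolves to flair/ner-german-large
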
@@ -1,39 +1,201 @@
- import re
+ import itertools
+ import math
+ from typing import Optional, List, Tuple
+ from pydantic import BaseModel, ConfigDict, Field
+ from collections import Counter
+
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
+     EntityRecognizer, RecognizerResult
+ from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
  import streamlit as st
- from spacy_download import load_spacy
+ import nltk
+ import pandas as pd
  from flair.data import Sentence
  from flair.models import SequenceTagger
+
  from sumy.parsers.plaintext import PlaintextParser
  from sumy.nlp.tokenizers import Tokenizer
  from sumy.nlp.stemmers import Stemmer
  from sumy.summarizers.lsa import LsaSummarizer
  from sumy.summarizers.luhn import LuhnSummarizer
  from sumy.utils import get_stop_words
- import itertools
- import numpy as np
- import math
- import nltk
- from typing import Optional, List
- from pydantic import BaseModel, ConfigDict, Field

- SPACY_NER_MODELS = {
-     "english": lambda: load_spacy(
-         "en_core_web_sm",
-         disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
-     )
+ from nlp.language import alpha2_to_language
+
+ BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                        "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+ PRESIDIO_TO_BASE_ALIASES = {
+     "PHONE_NUMBER": "PHONE",
+     "EMAIL_ADDRESS": "EMAIL",
+     "IBAN_CODE": "IBAN",
+     "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+     "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+     "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+     "PASSPORT": "US_PASSPORT",
+     "CREDIT_CARD": "CREDIT_CARD",
+     "URL": "URL",
+     "IP_ADDRESS": "IP_ADDRESS",
+     "CRYPTO": "CRYPTO",
+     "CRYPTO_WALLET": "CRYPTO",
+     "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+     "DATE_TIME": "DATE",
+     "LOCATION": "LOC",
+     "ORGANIZATION": "ORG",
  }
- FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
- REGEX_NER_MODELS = {
-     "IP_ADDRESS": [
-         r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
-     ],
-     "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
-     "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
-     "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
+
+ BASE_TO_RECOGNIZER_EXPANSIONS = {
+     "ORG": ["ORG", "ORGANIZATION"],
+     "LOC": ["LOC", "LOCATION"],
+     "PHONE": ["PHONE", "PHONE_NUMBER"],
+     "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+     "IBAN": ["IBAN", "IBAN_CODE"],
+     "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+     "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+     "DATE": ["DATE", "DATE_TIME"],
+     "PERSON": ["PERSON"],
+     "URL": ["URL"],
+     "IP_ADDRESS": ["IP_ADDRESS"],
+     "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+     "CREDIT_CARD": ["CREDIT_CARD"],
+     "US_SSN": ["US_SSN"],
+     "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+     "NORP": ["NORP"],
+     "GPE": ["GPE"],
+     "PRODUCT": ["PRODUCT"],
  }

  BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
- BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
+
+
+ class FlairRecognizer(EntityRecognizer):
+     ENTITIES = [
+         "LOC",
+         "PERSON",
+         "ORG",
+     ]
+
+     DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+     CHECK_LABEL_GROUPS = [
+         ({"LOC"}, {"LOC", "LOCATION"}),
+         ({"PERSON"}, {"PER", "PERSON"}),
+         ({"ORG"}, {"ORG", "ORGANIZATION"}),
+     ]
+
+     MODEL_LANGUAGES = {
+         "en": "flair/ner-english-large",
+         "es": "flair/ner-spanish-large",
+         "de": "flair/ner-german-large",
+         "nl": "flair/ner-dutch-large",
+         "multi": "flair/ner-multi",
+         "multi-fast": "flair/ner-multi-fast",
+     }
+
+     PRESIDIO_EQUIVALENCES = {
+         "PER": "PERSON",
+         "LOC": "LOC",
+         "ORG": "ORG"
+     }
+
+     def __init__(
+         self,
+         model: SequenceTagger = None,
+         supported_language: str = "en",
+         supported_entities: Optional[List[str]] = None,
+         check_label_groups: Optional[Tuple[set, set]] = None,
+     ):
+         self.check_label_groups = (
+             check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+         )
+
+         supported_entities = supported_entities if supported_entities else self.ENTITIES
+         self.model = model
+
+         super().__init__(
+             supported_entities=supported_entities,
+             supported_language=supported_language,
+             name="Flair Analytics",
+         )
+
+     def load(self) -> None:
+         pass
+
+     def get_supported_entities(self) -> List[str]:
+         return self.supported_entities
+
+     def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
+         results = []
+
+         sentences = Sentence(text)
+         self.model.predict(sentences)
+
+         if not entities:
+             entities = self.supported_entities
+
+         for entity in entities:
+             if entity not in self.supported_entities:
+                 continue
+
+             for ent in sentences.get_spans("ner"):
+                 if not self.__check_label(
+                     entity, ent.labels[0].value, self.check_label_groups
+                 ):
+                     continue
+                 textual_explanation = self.DEFAULT_EXPLANATION.format(
+                     ent.labels[0].value
+                 )
+                 explanation = self.build_flair_explanation(
+                     round(ent.score, 2), textual_explanation
+                 )
+                 flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                 results.append(flair_result)
+
+         return results
+
+     def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
+         explanation = AnalysisExplanation(
+             recognizer=self.__class__.__name__,
+             original_score=original_score,
+             textual_explanation=explanation,
+         )
+         return explanation
+
+     def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+         entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+         flair_score = round(entity.score, 2)
+
+         flair_results = RecognizerResult(
+             entity_type=entity_type,
+             start=entity.start_position,
+             end=entity.end_position,
+             score=flair_score,
+             analysis_explanation=explanation,
+         )
+
+         return flair_results
+
+     @staticmethod
+     def __check_label(
+         entity: str, label: str, check_label_groups: Tuple[set, set]
+     ) -> bool:
+         return any(
+             [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+         )
+
+
+ def normalize_label(label: str) -> str:
+     return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+ def expand_entities_for_analyzer(entities_list):
+     expanded = set()
+     for e in entities_list:
+         vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+         for v in vals:
+             expanded.add(v)
+     return list(expanded)


  def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
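
FlairRecognizer wraps a Flair SequenceTagger as a Presidio EntityRecognizer for PERSON/LOC/ORG, while normalize_label and expand_entities_for_analyzer translate between Presidio's entity names and the package's base labels. A small sketch of the two mapping helpers, with the import path assumed from RECORD:

    from streamlit_octostar_utils.nlp.ner import normalize_label, expand_entities_for_analyzer

    print(normalize_label("EMAIL_ADDRESS"))   # -> "EMAIL"
    print(normalize_label("PERSON"))          # unmapped labels pass through unchanged
    print(sorted(expand_entities_for_analyzer(["PHONE", "LOC"])))
    # -> ['LOC', 'LOCATION', 'PHONE', 'PHONE_NUMBER']
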
@@ -69,8 +231,14 @@ def _sumy__luhn_call(summarizer, document):


  def get_nltk_tokenizer(language: str) -> Tokenizer:
-     nltk.data.find("tokenizers/punkt")
-     return Tokenizer(language)
+     nltk_lang = alpha2_to_language(language).lower()
+
+     try:
+         nltk.data.find("tokenizers/punkt")
+     except LookupError:
+         nltk.download("punkt")
+
+     return Tokenizer(nltk_lang)


  class NERObject(BaseModel):
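
get_nltk_tokenizer now maps an ISO 639-1 code to the full language name expected by sumy's Tokenizer and downloads NLTK's punkt data when it is missing instead of raising a LookupError. Usage sketch under the same module-path assumption:

    from streamlit_octostar_utils.nlp.ner import get_nltk_tokenizer

    tok = get_nltk_tokenizer("en")  # "en" -> "english"; fetches tokenizers/punkt if missing
    print(tok.to_sentences("First sentence. Second one."))
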
@@ -121,137 +289,217 @@ def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_enti
      return final_entities


- def compute_ner(
-     language,
-     sentences,
-     spacy_model,
-     flair_model=None,
-     context_width=150,
-     with_scores=True,
-     with_comentions=True,
-     with_context=True,
+ def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
+     registry = RecognizerRegistry()
+
+     if engine_type == "flair":
+
+         flair_recognizer = FlairRecognizer(
+             model=model,
+             supported_language=language
+         )
+         registry.add_recognizer(flair_recognizer)
+
+         default_registry = RecognizerRegistry()
+         default_registry.load_predefined_recognizers()
+
+         flair_handled_entities = {"PERSON", "LOC", "ORG"}
+
+         for recognizer in default_registry.recognizers:
+             recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
+
+             if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
+                 continue
+
+             registry.add_recognizer(recognizer)
+
+         return AnalyzerEngine(
+             registry=registry,
+             supported_languages=[language]
+         )
+
+     else:
+         registry.load_predefined_recognizers()
+
+         if model is None:
+             raise ValueError("SpaCy model name must be provided")
+
+         configuration = {
+             "nlp_engine_name": "spacy",
+             "models": [{"lang_code": language, "model_name": model}],
+         }
+
+         provider = NlpEngineProvider(nlp_configuration=configuration)
+         nlp_engine = provider.create_engine()
+
+         return AnalyzerEngine(
+             nlp_engine=nlp_engine,
+             registry=registry,
+             supported_languages=[language],
+         )
+
+
+ def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, language: str,
+                           entities: Optional[List[str]], score_threshold: float) -> Optional[str]:
+     sample_values = column_values.dropna().head(50)
+
+     if sample_values.empty:
+         return None
+
+     entity_counter = Counter()
+
+     for value in sample_values:
+         text = str(value).strip()
+
+         if not text:
+             continue
+
+         results = analyzer.analyze(
+             text=text,
+             language=language,
+             entities=(expand_entities_for_analyzer(entities) if entities else None)
+         )
+
+         for result in results:
+             if result.score >= score_threshold:
+                 entity_counter[normalize_label(result.entity_type)] += 1
+
+     if not entity_counter:
+         return None
+
+     most_common = entity_counter.most_common(1)[0]
+     total_detections = sum(entity_counter.values())
+
+     if most_common[1] > total_detections * 0.5:
+         return most_common[0]
+
+     return most_common[0] if entity_counter else None
+
+
+ def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
+                                 entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
+     ner_objects = []
+
+     for column_name in df.columns:
+         entity_type = analyze_column_sample(
+             df[column_name],
+             analyzer,
+             language,
+             entities,
+             score_threshold
+         )
+
+         if entity_type:
+             for idx, value in df[column_name].dropna().items():
+                 text = str(value).strip()
+
+                 if text:
+                     ner_objects.append(NERObject(
+                         name=text[:100],
+                         label=entity_type,
+                         score=0.9,
+                         start=0,
+                         count=1,
+                         context=text[:100]
+                     ))
+
+     return ner_objects
+
+
+ def compute_ner_presidio(
+         text,
+         language,
+         analyzer,
+         entities=None,
+         score_threshold=0.5,
+         context_width=150,
+         with_comentions=True,
+         with_context=True,
+         batch_size=32,
+         n_process=4
  ):
-     sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
-     del sentence_starts[-1]
-     sentence_starts = list(np.cumsum(sentence_starts))
-     text = "\n".join([s[0] for s in sentences])
-     min_score = 1.0
-     entities: list[NERObject] = []
-
-     # FLAIR model (if not fast)
-     if flair_model:
-         input = [Sentence(sentence[0]) for sentence in sentences]
-         flair_model.predict(input)
-         output = [e for sentence in input for e in sentence.get_spans("ner")]
-         flair_entities = [
-             NERObject(
-                 name=entity.text,
-                 label=BASE_TO_ONTONOTES_LABELMAP.get(
-                     entity.annotation_layers["ner"][0].value,
-                     entity.annotation_layers["ner"][0].value,
-                 ),
-                 score=entity.score,
-                 start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+     if isinstance(text, pd.DataFrame):
+         if len(text) >= 100:
+             return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+
+         else:
+             texts = []
+
+             for col in text.columns:
+                 for idx, value in text[col].dropna().items():
+                     text_value = str(value).strip()
+
+                     if text_value:
+                         texts.append(text_value)
+
+             text = "\n".join(texts)
+
+     elif isinstance(text, list):
+         batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+         results_generator = batch_analyzer.analyze_iterator(
+             texts=text,
+             language=language,
+             batch_size=batch_size,
+             n_process=n_process,
+             entities=(expand_entities_for_analyzer(entities) if entities else None),
+         )
+
+         all_results = list(results_generator)
+         ner_objects = []
+
+         for text_item, results in zip(text, all_results):
+             for result in results:
+                 if result.score >= score_threshold:
+                     context_start = max(0, result.start - 30)
+                     context_end = min(len(text_item), result.end + 30)
+                     context = text_item[context_start:context_end] if with_context else None
+
+                     ner_objects.append(NERObject(
+                         name=text_item[result.start:result.end],
+                         label=normalize_label(result.entity_type),
+                         score=float(result.score),
+                         start=int(result.start),
+                         count=1,
+                         context=context
+                     ))
+
+         return ner_objects
+
+     results = analyzer.analyze(
+         text=text,
+         language=language,
+         entities=(expand_entities_for_analyzer(entities) if entities else None)
+     )
+
+     ner_objects = []
+
+     for result in results:
+         if result.score >= score_threshold:
+             context_start = max(0, result.start - math.floor(context_width / 2))
+             context_end = min(len(text), result.end + math.ceil(context_width / 2))
+             context = text[context_start:context_end] if with_context else None
+
+             ner_objects.append(NERObject(
+                 name=text[result.start:result.end],
+                 label=normalize_label(result.entity_type),
+                 score=float(result.score),
+                 start=int(result.start),
                  count=1,
-             )
-             for entity in output
-         ]
-         min_score = min([min_score] + [e.score for e in flair_entities])
-         entities += flair_entities
-         del flair_entities
-
-     # REGEX model
-     for label, regexes in REGEX_NER_MODELS.items():
-         if not isinstance(regexes, list):
-             regexes = [regexes]
-         for regex in regexes:
-             regex_entities = [
-                 NERObject(
-                     name=match.group(),
-                     label=label,
-                     score=min_score - 0.5,
-                     count=1,
-                     start=match.start(),
-                 )
-                 for match in re.finditer(regex, text)
-             ]
-             entities += regex_entities
-             min_score = min([min_score] + [e.score for e in regex_entities])
-
-     # SPACY model
-     chunks = []
-     chunk_start_offsets = []
-     current_chunk = []
-     current_length = 0
-     offset = 0
-     for sentence, _ in sentences:
-         sentence_len = len(sentence) + 1
-         if sentence_len > spacy_model.max_length:
-             truncated = sentence[: spacy_model.max_length - 1]
-             chunks.append(truncated)
-             chunk_start_offsets.append(offset)
-             offset += sentence_len
-             continue
-         if current_length + sentence_len > spacy_model.max_length:
-             chunks.append("\n".join(current_chunk))
-             chunk_start_offsets.append(offset - current_length)
-             current_chunk = []
-             current_length = 0
-         current_chunk.append(sentence)
-         current_length += sentence_len
-         offset += sentence_len
-     if current_chunk:
-         chunks.append("\n".join(current_chunk))
-         chunk_start_offsets.append(offset - current_length)
-     for i, chunk in enumerate(chunks):
-         doc = spacy_model(chunk)
-         chunk_offset = chunk_start_offsets[i]
-         for entity in doc.ents:
-             entities.append(
-                 NERObject(
-                     name=entity.text,
-                     label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-                     score=min_score - 0.5,
-                     start=chunk_offset + entity.start_char,
-                     count=1,
-                 )
-             )
-
-     # Reformatting for consistency
-     if not entities:
-         return []
-     if with_scores:
-         min_entity_score = min([e.score for e in entities])
-         max_entity_score = max([e.score for e in entities])
-         entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-         for e in entities:
-             e.score = (e.score - min_entity_score) / entity_score_range
-         scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
-         scores = [sentences[i - 1][1] for i in scores]
-         scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
-         for i in range(len(entities)):
-             entities[i].score = scores[i]
-     else:
-         for i in range(len(entities)):
-             entities[i].score = 0.0
+                 context=context
+             ))
+
      if with_comentions:
-         for i in range(len(entities)):
-             entity = entities[i]
+         for i in range(len(ner_objects)):
+             entity = ner_objects[i]
              comentions = [
-                 entities[j].name
-                 for j in range(len(entities))
-                 if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
+                 ner_objects[j].name
+                 for j in range(len(ner_objects))
+                 if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
              ]
-             entities[i].comentions = comentions
-     if with_context:
-         for i in range(len(entities)):
-             entity = entities[i]
-             if entity.start >= 0 and entity.start < len(text):
-                 left = max(0, entity.start - math.floor(context_width / 2))
-                 right = min(len(text), entity.start + math.ceil(context_width / 2))
-                 context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-                 entities[i].context = context
-     return entities
+             ner_objects[i].comentions = comentions
+
+     return ner_objects


  def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
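
build_presidio_analyzer assembles a Presidio AnalyzerEngine around either a spaCy NLP engine or the FlairRecognizer above, and compute_ner_presidio runs it over a string, a list of strings (batched), or a pandas DataFrame, returning NERObject instances. A sketch of the plain-string path, assuming en_core_web_sm is installed and using the module path from RECORD:

    from streamlit_octostar_utils.nlp.ner import build_presidio_analyzer, compute_ner_presidio

    analyzer = build_presidio_analyzer("en", engine_type="spacy", model="en_core_web_sm")
    found = compute_ner_presidio(
        "Jane Doe emailed jane@example.com from Berlin on 3 May 2021.",
        "en",
        analyzer,
        score_threshold=0.4,
    )
    for ent in found:
        print(ent.label, ent.name, round(ent.score, 2))
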
@@ -295,35 +543,92 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa


  def ner_pipe(
-     text,
-     language,
-     spacy_model,
-     flair_model=None,
-     fast=False,
-     compression_ratio="auto",
-     with_scores=True,
-     with_comentions=True,
-     with_context=True,
+         text,
+         language,
+         model,
+         engine_type="spacy",
+         fast=False,
+         compression_ratio="auto",
+         with_comentions=True,
+         with_context=True,
+         entities=None,
+         score_threshold=0.5,
+         batch_size=32,
+         n_process=4
  ):
-     if compression_ratio == "auto":
-         compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
-     sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
-     ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
+     analyzer = build_presidio_analyzer(
+         language=language,
+         engine_type=engine_type,
+         model=model,
+     )
+
+     if isinstance(text, pd.DataFrame):
+         ner = compute_ner_presidio(
+             text,
+             language,
+             analyzer,
+             entities,
+             score_threshold,
+             with_comentions=with_comentions,
+             with_context=with_context,
+             batch_size=batch_size,
+             n_process=n_process
+         )
+     else:
+         if compression_ratio == "auto":
+             compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
+
+         if compression_ratio > 1.0:
+             sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
+                                                with_scores=True)
+             text = " ".join([s[0] for s in sentences])
+
+         ner = compute_ner_presidio(
+             text,
+             language,
+             analyzer,
+             entities,
+             score_threshold,
+             with_comentions=with_comentions,
+             with_context=with_context,
+             batch_size=batch_size,
+             n_process=n_process
+         )
+
      return ner


- def get_ner_handler(language, fast=False):
+ def get_ner_handler(
+         language,
+         model,
+         engine_type="spacy",
+         fast=False,
+         entities=None,
+         score_threshold=0.5,
+         batch_size=32,
+         n_process=4
+ ):
      try:
-         get_nltk_tokenizer(language) # raises a LookupError if the language is not valid
+         get_nltk_tokenizer(language)
      except LookupError:
-         language = "english"
-     spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
-     flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
-     return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
-         text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
+         language = "en"
+
+     return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
+         text,
+         language,
+         model,
+         engine_type,
+         fast,
+         compression_ratio,
+         with_comentions,
+         with_context,
+         entities,
+         score_threshold,
+         batch_size,
+         n_process
      )


  @st.cache_resource
- def get_cached_ner_handler(language, fast):
-     return get_ner_handler(language, fast)
+ def get_cached_ner_handler(language, model):
+     return get_ner_handler(language, model)
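
get_ner_handler validates the language code and returns a closure over ner_pipe, which builds the Presidio analyzer on each call; the cached variant forwards only language and model, so the remaining parameters keep their defaults (spaCy engine, threshold 0.5). A usage sketch under the same assumptions as above:

    from streamlit_octostar_utils.nlp.ner import get_ner_handler

    ner = get_ner_handler("en", "en_core_web_sm", engine_type="spacy", score_threshold=0.5)
    results = ner("ACME Corp opened an office in Paris on 3 May 2021.")
    print([(e.label, e.name) for e in results])
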
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: streamlit-octostar-utils
- Version: 0.2.10
+ Version: 2.11a2
  Summary:
  License: MIT
  License-File: LICENSE
@@ -1,7 +1,7 @@
  streamlit_octostar_utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  streamlit_octostar_utils/api_crafter/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  streamlit_octostar_utils/api_crafter/celery.py,sha256=BXOTGN9egdD75qf-PkccLGAoniilB9PZ_NRchFIjWdw,30051
- streamlit_octostar_utils/api_crafter/fastapi.py,sha256=RKQrStPzG1I1pxsPJvGs_DRrnjlMJbVmu9ObMF2LgZ0,14368
+ streamlit_octostar_utils/api_crafter/fastapi.py,sha256=2bktT5Mwjs9XixWcOqUKMoLM_cgKl-cqZDUa2Imf4xA,14357
  streamlit_octostar_utils/api_crafter/nifi.py,sha256=yFs1HXpSVfWpOC1aJnNahjPofGzZ8fpuqvChloqM4rQ,45541
  streamlit_octostar_utils/api_crafter/parser/__init__.py,sha256=YeYWF6sdQiCFV_RKNW2t9Vs6KJExE2pbXxWTe_DOayY,107
  streamlit_octostar_utils/api_crafter/parser/combine_fields.py,sha256=ddc44xkajw8MU0peAX_263DL7rPXbTKbHUjpOhRgvyU,8790
@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
  streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
  streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
  streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
- streamlit_octostar_utils/nlp/language.py,sha256=BBBT8wtwWtVrCin5fNLMqGg5WdgHVotFkIvouk2qKh0,561
- streamlit_octostar_utils/nlp/ner.py,sha256=saE7A251JcAr6bFDGzRuSfXeqqRh5xbWRhgWbiKGeDM,13258
+ streamlit_octostar_utils/nlp/language.py,sha256=2d8Wq8wTuo_ehjZekuoe3bgJD52ieEiZKDUPdKdOxZ0,1699
+ streamlit_octostar_utils/nlp/ner.py,sha256=fuEbmrzXODVqm5piZdfNGkLGSwkrYrJO8KaeKUh7Uk0,20384
  streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
  streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
  streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
  streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
  streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
- streamlit_octostar_utils-0.2.10.dist-info/METADATA,sha256=YKHSxwF_9RwZOr2uzqwhjZA_Q9LWhsMl-GEvanSL9mE,2330
- streamlit_octostar_utils-0.2.10.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
- streamlit_octostar_utils-0.2.10.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
- streamlit_octostar_utils-0.2.10.dist-info/RECORD,,
+ streamlit_octostar_utils-2.11a2.dist-info/METADATA,sha256=lL8vvLY29MCTZ_gopVIlnWx436E3ZAyE6QGX9cY9qO8,2330
+ streamlit_octostar_utils-2.11a2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+ streamlit_octostar_utils-2.11a2.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+ streamlit_octostar_utils-2.11a2.dist-info/RECORD,,
@@ -1,4 +1,4 @@
  Wheel-Version: 1.0
- Generator: poetry-core 2.2.0
+ Generator: poetry-core 2.2.1
  Root-Is-Purelib: true
  Tag: py3-none-any