streamlit-octostar-utils 0.2.10__py3-none-any.whl → 2.11a2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- streamlit_octostar_utils/api_crafter/fastapi.py +1 -1
- streamlit_octostar_utils/nlp/language.py +40 -0
- streamlit_octostar_utils/nlp/ner.py +477 -172
- {streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/METADATA +1 -1
- {streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/RECORD +7 -7
- {streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/WHEEL +1 -1
- {streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/licenses/LICENSE +0 -0
streamlit_octostar_utils/api_crafter/fastapi.py

```diff
@@ -330,7 +330,7 @@ class DefaultErrorRoute:
         if len(message) > MAX_ERROR_MESSAGE_BYTES:
             message = message[-MAX_ERROR_MESSAGE_BYTES:]
         try:
-            tcbk =
+            tcbk = traceback.format_exception(exc)
             if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
                 tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
         except:
```
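For orientation: the single-argument form of `traceback.format_exception(exc)` used in the patched line requires Python 3.10+ and returns a list of formatted lines, which the length check above then trims from the tail. A minimal standalone sketch of the same pattern; the constant values and the `format_error` helper are hypothetical, not the ones defined in fastapi.py:

```python
import traceback

# Hypothetical limits; fastapi.py defines its own MAX_* constants.
MAX_ERROR_MESSAGE_BYTES = 2048
MAX_ERROR_TRACEBACK_BYTES = 4096

def format_error(exc: BaseException):
    """Sketch of the truncation pattern used by DefaultErrorRoute."""
    message = str(exc)
    if len(message) > MAX_ERROR_MESSAGE_BYTES:
        message = message[-MAX_ERROR_MESSAGE_BYTES:]
    try:
        # Python 3.10+: format_exception accepts the exception instance alone
        # and returns a list of strings.
        tcbk = traceback.format_exception(exc)
        if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
            tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
    except Exception:
        tcbk = None
    return message, tcbk

try:
    1 / 0
except ZeroDivisionError as e:
    msg, tb = format_error(e)
    print(msg, "".join(tb))
```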
streamlit_octostar_utils/nlp/language.py

```diff
@@ -3,6 +3,19 @@ import py3langid as langid
 import iso639 as languages
 
 
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
 def detect_language(text, min_confidence=None):
     detector = langid.langid.LanguageIdentifier.from_pickled_model(
         langid.langid.MODEL_FILE, norm_probs=True
@@ -13,3 +26,30 @@ def detect_language(text, min_confidence=None):
     detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
     detected_lang = languages.to_name(detected_lang).lower()
     return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
```
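For orientation, a short usage sketch of the helpers this file now exposes. It assumes the `iso639` package resolves names the way the wrappers expect and that the Flair/spaCy weights named in the tables above can be downloaded; the model-loading calls are left commented out because they fetch large files:

```python
from streamlit_octostar_utils.nlp.language import (
    alpha2_to_language,
    language_to_alpha2,
    detect_language,
    load_language_model,
)

print(alpha2_to_language("de"))        # e.g. "German" (exact casing comes from the iso639 data)
print(language_to_alpha2("German"))    # e.g. "de"
print(detect_language("Das ist ein kurzer deutscher Satz."))  # (language name, confidence)

# Heavy downloads on first use, so these stay commented out here:
# tagger = load_language_model("de", "flair")   # -> flair/ner-german-large SequenceTagger
# nlp = load_language_model("fr", "spacy")      # falls back to en_core_web_sm (only "en" is mapped)
```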
streamlit_octostar_utils/nlp/ner.py

```diff
@@ -1,39 +1,201 @@
-import
+import itertools
+import math
+from typing import Optional, List, Tuple
+from pydantic import BaseModel, ConfigDict, Field
+from collections import Counter
+
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
+    EntityRecognizer, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
 import streamlit as st
-
+import nltk
+import pandas as pd
 from flair.data import Sentence
 from flair.models import SequenceTagger
+
 from sumy.parsers.plaintext import PlaintextParser
 from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
 from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
-import itertools
-import numpy as np
-import math
-import nltk
-from typing import Optional, List
-from pydantic import BaseModel, ConfigDict, Field
 
-
-
-
-
-
+from nlp.language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
 }
-
-
-    "
-
-    ],
-    "
-    "
-    "
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
 }
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-
+
+
+class FlairRecognizer(EntityRecognizer):
+    ENTITIES = [
+        "LOC",
+        "PERSON",
+        "ORG",
+    ]
+
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+    CHECK_LABEL_GROUPS = [
+        ({"LOC"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
+    ]
+
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large",
+        "es": "flair/ner-spanish-large",
+        "de": "flair/ner-german-large",
+        "nl": "flair/ner-dutch-large",
+        "multi": "flair/ner-multi",
+        "multi-fast": "flair/ner-multi-fast",
+    }
+
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOC",
+        "ORG": "ORG"
+    }
+
+    def __init__(
+        self,
+        model: SequenceTagger = None,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[set, set]] = None,
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        self.model = model
+
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+
+    def load(self) -> None:
+        pass
+
+    def get_supported_entities(self) -> List[str]:
+        return self.supported_entities
+
+    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
+        results = []
+
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+
+        if not entities:
+            entities = self.supported_entities
+
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                results.append(flair_result)
+
+        return results
+
+    def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+
+        return flair_results
+
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[set, set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+
+
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
```
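For orientation, the two new module-level helpers translate between Presidio's entity names and the base label set defined above. A small sketch, assuming `streamlit_octostar_utils.nlp.ner` and its heavy dependencies (presidio, flair, sumy, streamlit) import cleanly in your environment:

```python
from streamlit_octostar_utils.nlp.ner import (
    BASE_ALLOWED_LABELS,
    expand_entities_for_analyzer,
    normalize_label,
)

# Presidio-native names collapse onto the base label set...
print(normalize_label("PHONE_NUMBER"))  # -> "PHONE"
print(normalize_label("DATE_TIME"))     # -> "DATE"
print(normalize_label("PERSON"))        # unknown/identity labels pass through unchanged

# ...and base labels fan back out to every recognizer-specific alias before
# being handed to AnalyzerEngine.analyze(entities=...).
print(sorted(expand_entities_for_analyzer(["PHONE", "CRYPTO"])))
# ['CRYPTO', 'CRYPTO_WALLET', 'CRYPTO_WALLET_ADDRESS', 'PHONE', 'PHONE_NUMBER']

print(len(BASE_ALLOWED_LABELS))  # 18 base labels
```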
streamlit_octostar_utils/nlp/ner.py (continued)

```diff
@@ -69,8 +231,14 @@ def _sumy__luhn_call(summarizer, document):
 
 
 def get_nltk_tokenizer(language: str) -> Tokenizer:
-
-
+    nltk_lang = alpha2_to_language(language).lower()
+
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+
+    return Tokenizer(nltk_lang)
 
 
 class NERObject(BaseModel):
```
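A quick check of the rewritten tokenizer helper, assuming the package plus NLTK and sumy are installed (the punkt data is downloaded on first use). The alpha-2 code is mapped to a lowercase language name because sumy's Tokenizer expects names such as "english" rather than codes:

```python
from streamlit_octostar_utils.nlp.ner import get_nltk_tokenizer

tokenizer = get_nltk_tokenizer("en")  # "en" -> "english" via alpha2_to_language()
print(tokenizer.to_sentences("First sentence. Second sentence?"))
# sumy returns the sentences as a tuple of strings.
```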
streamlit_octostar_utils/nlp/ner.py (continued)

```diff
@@ -121,137 +289,217 @@ def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_enti
     return final_entities
 
 
-def
-
-
-
-
-
-
-
-
+def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
+    registry = RecognizerRegistry()
+
+    if engine_type == "flair":
+
+        flair_recognizer = FlairRecognizer(
+            model=model,
+            supported_language=language
+        )
+        registry.add_recognizer(flair_recognizer)
+
+        default_registry = RecognizerRegistry()
+        default_registry.load_predefined_recognizers()
+
+        flair_handled_entities = {"PERSON", "LOC", "ORG"}
+
+        for recognizer in default_registry.recognizers:
+            recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
+
+            if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
+                continue
+
+            registry.add_recognizer(recognizer)
+
+        return AnalyzerEngine(
+            registry=registry,
+            supported_languages=[language]
+        )
+
+    else:
+        registry.load_predefined_recognizers()
+
+        if model is None:
+            raise ValueError("SpaCy model name must be provided")
+
+        configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": language, "model_name": model}],
+        }
+
+        provider = NlpEngineProvider(nlp_configuration=configuration)
+        nlp_engine = provider.create_engine()
+
+        return AnalyzerEngine(
+            nlp_engine=nlp_engine,
+            registry=registry,
+            supported_languages=[language],
+        )
+
+
+def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, language: str,
+                          entities: Optional[List[str]], score_threshold: float) -> Optional[str]:
+    sample_values = column_values.dropna().head(50)
+
+    if sample_values.empty:
+        return None
+
+    entity_counter = Counter()
+
+    for value in sample_values:
+        text = str(value).strip()
+
+        if not text:
+            continue
+
+        results = analyzer.analyze(
+            text=text,
+            language=language,
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
+        )
+
+        for result in results:
+            if result.score >= score_threshold:
+                entity_counter[normalize_label(result.entity_type)] += 1
+
+    if not entity_counter:
+        return None
+
+    most_common = entity_counter.most_common(1)[0]
+    total_detections = sum(entity_counter.values())
+
+    if most_common[1] > total_detections * 0.5:
+        return most_common[0]
+
+    return most_common[0] if entity_counter else None
+
+
+def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
+                                entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
+    ner_objects = []
+
+    for column_name in df.columns:
+        entity_type = analyze_column_sample(
+            df[column_name],
+            analyzer,
+            language,
+            entities,
+            score_threshold
+        )
+
+        if entity_type:
+            for idx, value in df[column_name].dropna().items():
+                text = str(value).strip()
+
+                if text:
+                    ner_objects.append(NERObject(
+                        name=text[:100],
+                        label=entity_type,
+                        score=0.9,
+                        start=0,
+                        count=1,
+                        context=text[:100]
+                    ))
+
+    return ner_objects
+
+
+def compute_ner_presidio(
+    text,
+    language,
+    analyzer,
+    entities=None,
+    score_threshold=0.5,
+    context_width=150,
+    with_comentions=True,
+    with_context=True,
+    batch_size=32,
+    n_process=4
 ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if isinstance(text, pd.DataFrame):
+        if len(text) >= 100:
+            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+
+        else:
+            texts = []
+
+            for col in text.columns:
+                for idx, value in text[col].dropna().items():
+                    text_value = str(value).strip()
+
+                    if text_value:
+                        texts.append(text_value)
+
+            text = "\n".join(texts)
+
+    elif isinstance(text, list):
+        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+        results_generator = batch_analyzer.analyze_iterator(
+            texts=text,
+            language=language,
+            batch_size=batch_size,
+            n_process=n_process,
+            entities=(expand_entities_for_analyzer(entities) if entities else None),
+        )
+
+        all_results = list(results_generator)
+        ner_objects = []
+
+        for text_item, results in zip(text, all_results):
+            for result in results:
+                if result.score >= score_threshold:
+                    context_start = max(0, result.start - 30)
+                    context_end = min(len(text_item), result.end + 30)
+                    context = text_item[context_start:context_end] if with_context else None
+
+                    ner_objects.append(NERObject(
+                        name=text_item[result.start:result.end],
+                        label=normalize_label(result.entity_type),
+                        score=float(result.score),
+                        start=int(result.start),
+                        count=1,
+                        context=context
+                    ))
+
+        return ner_objects
+
+    results = analyzer.analyze(
+        text=text,
+        language=language,
+        entities=(expand_entities_for_analyzer(entities) if entities else None)
+    )
+
+    ner_objects = []
+
+    for result in results:
+        if result.score >= score_threshold:
+            context_start = max(0, result.start - math.floor(context_width / 2))
+            context_end = min(len(text), result.end + math.ceil(context_width / 2))
+            context = text[context_start:context_end] if with_context else None
+
+            ner_objects.append(NERObject(
+                name=text[result.start:result.end],
+                label=normalize_label(result.entity_type),
+                score=float(result.score),
+                start=int(result.start),
                 count=1,
-
-
-
-    min_score = min([min_score] + [e.score for e in flair_entities])
-    entities += flair_entities
-    del flair_entities
-
-    # REGEX model
-    for label, regexes in REGEX_NER_MODELS.items():
-        if not isinstance(regexes, list):
-            regexes = [regexes]
-        for regex in regexes:
-            regex_entities = [
-                NERObject(
-                    name=match.group(),
-                    label=label,
-                    score=min_score - 0.5,
-                    count=1,
-                    start=match.start(),
-                )
-                for match in re.finditer(regex, text)
-            ]
-            entities += regex_entities
-            min_score = min([min_score] + [e.score for e in regex_entities])
-
-    # SPACY model
-    chunks = []
-    chunk_start_offsets = []
-    current_chunk = []
-    current_length = 0
-    offset = 0
-    for sentence, _ in sentences:
-        sentence_len = len(sentence) + 1
-        if sentence_len > spacy_model.max_length:
-            truncated = sentence[: spacy_model.max_length - 1]
-            chunks.append(truncated)
-            chunk_start_offsets.append(offset)
-            offset += sentence_len
-            continue
-        if current_length + sentence_len > spacy_model.max_length:
-            chunks.append("\n".join(current_chunk))
-            chunk_start_offsets.append(offset - current_length)
-            current_chunk = []
-            current_length = 0
-        current_chunk.append(sentence)
-        current_length += sentence_len
-        offset += sentence_len
-    if current_chunk:
-        chunks.append("\n".join(current_chunk))
-        chunk_start_offsets.append(offset - current_length)
-    for i, chunk in enumerate(chunks):
-        doc = spacy_model(chunk)
-        chunk_offset = chunk_start_offsets[i]
-        for entity in doc.ents:
-            entities.append(
-                NERObject(
-                    name=entity.text,
-                    label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-                    score=min_score - 0.5,
-                    start=chunk_offset + entity.start_char,
-                    count=1,
-                )
-            )
-
-    # Reformatting for consistency
-    if not entities:
-        return []
-    if with_scores:
-        min_entity_score = min([e.score for e in entities])
-        max_entity_score = max([e.score for e in entities])
-        entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-        for e in entities:
-            e.score = (e.score - min_entity_score) / entity_score_range
-        scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
-        scores = [sentences[i - 1][1] for i in scores]
-        scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
-        for i in range(len(entities)):
-            entities[i].score = scores[i]
-    else:
-        for i in range(len(entities)):
-            entities[i].score = 0.0
+                context=context
+            ))
+
     if with_comentions:
-        for i in range(len(
-            entity =
+        for i in range(len(ner_objects)):
+            entity = ner_objects[i]
             comentions = [
-
-                for j in range(len(
-                if j != i and abs(
+                ner_objects[j].name
+                for j in range(len(ner_objects))
+                if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
             ]
-
-
-
-            entity = entities[i]
-            if entity.start >= 0 and entity.start < len(text):
-                left = max(0, entity.start - math.floor(context_width / 2))
-                right = min(len(text), entity.start + math.ceil(context_width / 2))
-                context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-                entities[i].context = context
-    return entities
+            ner_objects[i].comentions = comentions
+
+    return ner_objects
 
 
 def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
```
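A usage sketch of the new Presidio-based path, assuming the package is importable, `en_core_web_sm` is installed for the spaCy branch, and the sample text is hypothetical. The flair branch would instead pass `engine_type="flair"` together with a preloaded `SequenceTagger`:

```python
from streamlit_octostar_utils.nlp.ner import build_presidio_analyzer, compute_ner_presidio

# spaCy branch: `model` is the spaCy model *name*, wired into Presidio's NlpEngineProvider.
analyzer = build_presidio_analyzer(language="en", engine_type="spacy", model="en_core_web_sm")

text = "John Smith emailed jane.doe@example.com from Berlin on 2024-01-15."
for ent in compute_ner_presidio(text, "en", analyzer, entities=["PERSON", "EMAIL", "LOC", "DATE"]):
    print(ent.label, repr(ent.name), round(ent.score, 2), ent.start)

# Lists are routed through BatchAnalyzerEngine, and DataFrames with >= 100 rows
# are profiled column-by-column instead of row-by-row:
# compute_ner_presidio(["first document", "second document"], "en", analyzer)
# compute_ner_presidio(big_dataframe, "en", analyzer)
```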
streamlit_octostar_utils/nlp/ner.py (continued)

```diff
@@ -295,35 +543,92 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
 
 
 def ner_pipe(
-
-
-
-
-
-
-
-
-
+    text,
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    compression_ratio="auto",
+    with_comentions=True,
+    with_context=True,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
 ):
-
-
-
-
+    analyzer = build_presidio_analyzer(
+        language=language,
+        engine_type=engine_type,
+        model=model,
+    )
+
+    if isinstance(text, pd.DataFrame):
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+    else:
+        if compression_ratio == "auto":
+            compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
+
+        if compression_ratio > 1.0:
+            sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
+                                                with_scores=True)
+            text = " ".join([s[0] for s in sentences])
+
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+
     return ner
 
 
-def get_ner_handler(
+def get_ner_handler(
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
     try:
-        get_nltk_tokenizer(language)
+        get_nltk_tokenizer(language)
     except LookupError:
-        language = "
-
-
-
-
+        language = "en"
+
+    return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
+        text,
+        language,
+        model,
+        engine_type,
+        fast,
+        compression_ratio,
+        with_comentions,
+        with_context,
+        entities,
+        score_threshold,
+        batch_size,
+        n_process
     )
 
 
 @st.cache_resource
-def get_cached_ner_handler(language,
-    return get_ner_handler(language,
+def get_cached_ner_handler(language, model):
+    return get_ner_handler(language, model)
```
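Finally, the handler factory that the Streamlit layer caches. A sketch under the same assumptions as above; note that the cached wrapper only forwards `language` and `model`, so every other knob falls back to its default:

```python
from streamlit_octostar_utils.nlp.ner import get_ner_handler

ner = get_ner_handler("en", "en_core_web_sm", engine_type="spacy", score_threshold=0.6)

for ent in ner("Maria Rossi wired 500 EUR to IBAN DE89370400440532013000 yesterday."):
    print(ent.label, ent.name, ent.comentions)

# With fast=True and compression_ratio="auto", long plain-text inputs are first
# shortened via get_extractive_summary(); DataFrame inputs skip the summarizer entirely.
```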
{streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/RECORD RENAMED

```diff
@@ -1,7 +1,7 @@
 streamlit_octostar_utils/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/api_crafter/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/api_crafter/celery.py,sha256=BXOTGN9egdD75qf-PkccLGAoniilB9PZ_NRchFIjWdw,30051
-streamlit_octostar_utils/api_crafter/fastapi.py,sha256=
+streamlit_octostar_utils/api_crafter/fastapi.py,sha256=2bktT5Mwjs9XixWcOqUKMoLM_cgKl-cqZDUa2Imf4xA,14357
 streamlit_octostar_utils/api_crafter/nifi.py,sha256=yFs1HXpSVfWpOC1aJnNahjPofGzZ8fpuqvChloqM4rQ,45541
 streamlit_octostar_utils/api_crafter/parser/__init__.py,sha256=YeYWF6sdQiCFV_RKNW2t9Vs6KJExE2pbXxWTe_DOayY,107
 streamlit_octostar_utils/api_crafter/parser/combine_fields.py,sha256=ddc44xkajw8MU0peAX_263DL7rPXbTKbHUjpOhRgvyU,8790
@@ -20,8 +20,8 @@ streamlit_octostar_utils/core/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEkt
 streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi3Ak5SQQ4zQ1OPTs_OP_kD20,2255
 streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
 streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
-streamlit_octostar_utils/nlp/language.py,sha256=
-streamlit_octostar_utils/nlp/ner.py,sha256=
+streamlit_octostar_utils/nlp/language.py,sha256=2d8Wq8wTuo_ehjZekuoe3bgJD52ieEiZKDUPdKdOxZ0,1699
+streamlit_octostar_utils/nlp/ner.py,sha256=fuEbmrzXODVqm5piZdfNGkLGSwkrYrJO8KaeKUh7Uk0,20384
 streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
 streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
 streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
 streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
 streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
-streamlit_octostar_utils-
-streamlit_octostar_utils-
-streamlit_octostar_utils-
-streamlit_octostar_utils-
+streamlit_octostar_utils-2.11a2.dist-info/METADATA,sha256=lL8vvLY29MCTZ_gopVIlnWx436E3ZAyE6QGX9cY9qO8,2330
+streamlit_octostar_utils-2.11a2.dist-info/WHEEL,sha256=zp0Cn7JsFoX2ATtOhtaFYIiE2rmFAD4OcMhtUki8W3U,88
+streamlit_octostar_utils-2.11a2.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+streamlit_octostar_utils-2.11a2.dist-info/RECORD,,
```

{streamlit_octostar_utils-0.2.10.dist-info → streamlit_octostar_utils-2.11a2.dist-info}/licenses/LICENSE RENAMED
File without changes