streamlit-octostar-utils 0.2.10.tar.gz → 2.11a2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/fastapi.py +1 -1
- streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/language.py +55 -0
- streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/ner.py +634 -0
- streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/language.py +0 -15
- streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/ner.py +0 -329
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/LICENSE +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/README.md +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
streamlit_octostar_utils/api_crafter/fastapi.py

@@ -330,7 +330,7 @@ class DefaultErrorRoute:
         if len(message) > MAX_ERROR_MESSAGE_BYTES:
             message = message[-MAX_ERROR_MESSAGE_BYTES:]
         try:
-            tcbk =
+            tcbk = traceback.format_exception(exc)
             if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
                 tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
         except:
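For reference, calling `traceback.format_exception` with just the exception instance is the Python 3.10+ standard-library form used on the new right-hand side; a minimal sketch of what it produces (illustrative, not part of the package):

```python
import traceback

try:
    1 / 0
except ZeroDivisionError as exc:
    # Python 3.10+: the exception instance alone is accepted; the call
    # returns a list of formatted strings, one per traceback line.
    tcbk = traceback.format_exception(exc)
    print("".join(tcbk))
```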
streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/language.py (new file)

@@ -0,0 +1,55 @@
+import re
+import py3langid as langid
+import iso639 as languages
+
+
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
+def detect_language(text, min_confidence=None):
+    detector = langid.langid.LanguageIdentifier.from_pickled_model(
+        langid.langid.MODEL_FILE, norm_probs=True
+    )
+    detected_lang, confidence = detector.classify(text)
+    if min_confidence and confidence < min_confidence:
+        return None, confidence
+    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
+    detected_lang = languages.to_name(detected_lang).lower()
+    return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
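A hedged usage sketch of the new `nlp/language.py` helpers (assumes the package and its optional dependencies — py3langid, iso639, spacy_download, flair — are installed; the printed values are illustrative):

```python
from streamlit_octostar_utils.nlp.language import (
    detect_language, alpha2_to_language, language_to_alpha2, load_language_model
)

# Language detection via py3langid, normalized to a lowercase English name.
lang, confidence = detect_language("Ceci est un petit texte en français.")
print(lang, confidence)              # e.g. ("french", 0.99)

print(alpha2_to_language("fr"))      # e.g. "French"
print(language_to_alpha2("German"))  # e.g. "de"

# Loads a spaCy or Flair model keyed by ISO 639-1 code; unknown codes fall
# back to "en" for spaCy and to "flair/ner-multi" for Flair.
spacy_model = load_language_model("en", "spacy")
flair_tagger = load_language_model("de", "flair")
```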
streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/ner.py (new file)

@@ -0,0 +1,634 @@
+import itertools
+import math
+from typing import Optional, List, Tuple
+from pydantic import BaseModel, ConfigDict, Field
+from collections import Counter
+
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
+    EntityRecognizer, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
+import streamlit as st
+import nltk
+import pandas as pd
+from flair.data import Sentence
+from flair.models import SequenceTagger
+
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.summarizers.lsa import LsaSummarizer
+from sumy.summarizers.luhn import LuhnSummarizer
+from sumy.utils import get_stop_words
+
+from nlp.language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
+}
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
+}
+
+BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
+
+
+class FlairRecognizer(EntityRecognizer):
+    ENTITIES = [
+        "LOC",
+        "PERSON",
+        "ORG",
+    ]
+
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+    CHECK_LABEL_GROUPS = [
+        ({"LOC"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
+    ]
+
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large",
+        "es": "flair/ner-spanish-large",
+        "de": "flair/ner-german-large",
+        "nl": "flair/ner-dutch-large",
+        "multi": "flair/ner-multi",
+        "multi-fast": "flair/ner-multi-fast",
+    }
+
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOC",
+        "ORG": "ORG"
+    }
+
+    def __init__(
+        self,
+        model: SequenceTagger = None,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[set, set]] = None,
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        self.model = model
+
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+
+    def load(self) -> None:
+        pass
+
+    def get_supported_entities(self) -> List[str]:
+        return self.supported_entities
+
+    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
+        results = []
+
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+
+        if not entities:
+            entities = self.supported_entities
+
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                results.append(flair_result)
+
+        return results
+
+    def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+
+        return flair_results
+
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[set, set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+
+
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
+
+
+def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
+    from operator import attrgetter
+    from sumy.summarizers._summarizer import SentenceInfo
+
+    rate = rating
+    if isinstance(rating, dict):
+        assert not args and not kwargs
+        rate = lambda s: rating[s]
+    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
+    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
+    return tuple((i.sentence, i.rating, i.order) for i in infos)
+
+
+def _sumy__lsa_call(summarizer, document):
+    summarizer._ensure_dependecies_installed()
+    dictionary = summarizer._create_dictionary(document)
+    if not dictionary:
+        return ()
+    matrix = summarizer._create_matrix(document, dictionary)
+    matrix = summarizer._compute_term_frequency(matrix)
+    from numpy.linalg import svd as singular_value_decomposition
+
+    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
+    ranks = iter(summarizer._compute_ranks(sigma, v))
+    return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
+
+
+def _sumy__luhn_call(summarizer, document):
+    words = summarizer._get_significant_words(document.words)
+    return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
+
+
+def get_nltk_tokenizer(language: str) -> Tokenizer:
+    nltk_lang = alpha2_to_language(language).lower()
+
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+
+    return Tokenizer(nltk_lang)
+
+
+class NERObject(BaseModel):
+    name: str
+    label: str
+    score: float = 0.0
+    start: int
+    count: int
+    context: str | None = None
+    comentions: list[str] = Field(default_factory=list)
+    model_config = ConfigDict(extra="allow")
+
+    def __repr__(self):
+        return f"NERObject(label={self.label},name={self.name})"
+
+
+def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
+    if whitelisted_labels is not None:
+        entities = [e for e in entities if e.label in whitelisted_labels]
+    entities = sorted(entities, key=lambda x: x.name)
+    final_entities = []
+    for _, group in itertools.groupby(entities, key=lambda x: x.name):
+        group = list(group)
+        best_entity = max(group, key=lambda x: x.score * x.count)
+        merged_data = {
+            "name": best_entity.name,
+            "label": best_entity.label,
+            "score": best_entity.score,
+            "context": best_entity.context,
+            "count": sum(e.count for e in group),
+            "start": best_entity.start,
+        }
+        all_fields = best_entity.model_fields.keys()
+        for field in all_fields:
+            if field in merged_data:
+                continue
+            values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
+            if not values:
+                continue
+            if isinstance(values[0], list):
+                merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
+            else:
+                merged_data[field] = getattr(best_entity, field, None)
+        final_entities.append(NERObject(**merged_data))
+    final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
+    if max_entities and len(final_entities) > max_entities:
+        final_entities = final_entities[:max_entities]
+    return final_entities
+
+
+def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
+    registry = RecognizerRegistry()
+
+    if engine_type == "flair":
+
+        flair_recognizer = FlairRecognizer(
+            model=model,
+            supported_language=language
+        )
+        registry.add_recognizer(flair_recognizer)
+
+        default_registry = RecognizerRegistry()
+        default_registry.load_predefined_recognizers()
+
+        flair_handled_entities = {"PERSON", "LOC", "ORG"}
+
+        for recognizer in default_registry.recognizers:
+            recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
+
+            if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
+                continue
+
+            registry.add_recognizer(recognizer)
+
+        return AnalyzerEngine(
+            registry=registry,
+            supported_languages=[language]
+        )
+
+    else:
+        registry.load_predefined_recognizers()
+
+        if model is None:
+            raise ValueError("SpaCy model name must be provided")
+
+        configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": language, "model_name": model}],
+        }
+
+        provider = NlpEngineProvider(nlp_configuration=configuration)
+        nlp_engine = provider.create_engine()
+
+        return AnalyzerEngine(
+            nlp_engine=nlp_engine,
+            registry=registry,
+            supported_languages=[language],
+        )
+
+
+def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, language: str,
+                          entities: Optional[List[str]], score_threshold: float) -> Optional[str]:
+    sample_values = column_values.dropna().head(50)
+
+    if sample_values.empty:
+        return None
+
+    entity_counter = Counter()
+
+    for value in sample_values:
+        text = str(value).strip()
+
+        if not text:
+            continue
+
+        results = analyzer.analyze(
+            text=text,
+            language=language,
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
+        )
+
+        for result in results:
+            if result.score >= score_threshold:
+                entity_counter[normalize_label(result.entity_type)] += 1
+
+    if not entity_counter:
+        return None
+
+    most_common = entity_counter.most_common(1)[0]
+    total_detections = sum(entity_counter.values())
+
+    if most_common[1] > total_detections * 0.5:
+        return most_common[0]
+
+    return most_common[0] if entity_counter else None
+
+
+def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
+                                entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
+    ner_objects = []
+
+    for column_name in df.columns:
+        entity_type = analyze_column_sample(
+            df[column_name],
+            analyzer,
+            language,
+            entities,
+            score_threshold
+        )
+
+        if entity_type:
+            for idx, value in df[column_name].dropna().items():
+                text = str(value).strip()
+
+                if text:
+                    ner_objects.append(NERObject(
+                        name=text[:100],
+                        label=entity_type,
+                        score=0.9,
+                        start=0,
+                        count=1,
+                        context=text[:100]
+                    ))
+
+    return ner_objects
+
+
+def compute_ner_presidio(
+    text,
+    language,
+    analyzer,
+    entities=None,
+    score_threshold=0.5,
+    context_width=150,
+    with_comentions=True,
+    with_context=True,
+    batch_size=32,
+    n_process=4
+):
+    if isinstance(text, pd.DataFrame):
+        if len(text) >= 100:
+            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+
+        else:
+            texts = []
+
+            for col in text.columns:
+                for idx, value in text[col].dropna().items():
+                    text_value = str(value).strip()
+
+                    if text_value:
+                        texts.append(text_value)
+
+            text = "\n".join(texts)
+
+    elif isinstance(text, list):
+        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+        results_generator = batch_analyzer.analyze_iterator(
+            texts=text,
+            language=language,
+            batch_size=batch_size,
+            n_process=n_process,
+            entities=(expand_entities_for_analyzer(entities) if entities else None),
+        )
+
+        all_results = list(results_generator)
+        ner_objects = []
+
+        for text_item, results in zip(text, all_results):
+            for result in results:
+                if result.score >= score_threshold:
+                    context_start = max(0, result.start - 30)
+                    context_end = min(len(text_item), result.end + 30)
+                    context = text_item[context_start:context_end] if with_context else None
+
+                    ner_objects.append(NERObject(
+                        name=text_item[result.start:result.end],
+                        label=normalize_label(result.entity_type),
+                        score=float(result.score),
+                        start=int(result.start),
+                        count=1,
+                        context=context
+                    ))
+
+        return ner_objects
+
+    results = analyzer.analyze(
+        text=text,
+        language=language,
+        entities=(expand_entities_for_analyzer(entities) if entities else None)
+    )
+
+    ner_objects = []
+
+    for result in results:
+        if result.score >= score_threshold:
+            context_start = max(0, result.start - math.floor(context_width / 2))
+            context_end = min(len(text), result.end + math.ceil(context_width / 2))
+            context = text[context_start:context_end] if with_context else None
+
+            ner_objects.append(NERObject(
+                name=text[result.start:result.end],
+                label=normalize_label(result.entity_type),
+                score=float(result.score),
+                start=int(result.start),
+                count=1,
+                context=context
+            ))
+
+    if with_comentions:
+        for i in range(len(ner_objects)):
+            entity = ner_objects[i]
+            comentions = [
+                ner_objects[j].name
+                for j in range(len(ner_objects))
+                if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
+            ]
+            ner_objects[i].comentions = comentions
+
+    return ner_objects
+
+
+def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
+    tokenizer = get_nltk_tokenizer(language)
+    stemmer = Stemmer(language)
+    parser = PlaintextParser.from_string(text, tokenizer)
+    if fast:
+        summarizer = LuhnSummarizer(stemmer)
+        summarizer.stop_words = get_stop_words(language)
+        scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
+    else:
+        summarizer = LsaSummarizer(stemmer)
+        summarizer.stop_words = get_stop_words(language)
+        scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
+    summary = []
+    summary_chars = 0
+    summary_chars_penultimate = 0
+    while summary_chars < max_chars:
+        try:
+            next_sentence = next(scored_sentences)
+            summary.append(next_sentence)
+            summary_chars_penultimate = summary_chars
+            summary_chars += len(" " + next_sentence[0]._text)
+        except StopIteration:
+            break
+    summary = sorted(summary, key=lambda x: x[2])
+    summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
+    if summary_chars > max_chars:
+        summary[-1] = (
+            summary[-1][0][: max_chars - summary_chars_penultimate],
+            summary[-1][1],
+        )
+    if not with_scores:
+        summary = " ".join([s[0] for s in summary])
+    else:
+        min_score = min([s[1] for s in summary]) if summary else 0
+        max_score = max([min_score] + [s[1] for s in summary])
+        score_range = 1 if min_score == max_score else (max_score - min_score)
+        summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
+    return summary
+
+
+def ner_pipe(
+    text,
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    compression_ratio="auto",
+    with_comentions=True,
+    with_context=True,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
+    analyzer = build_presidio_analyzer(
+        language=language,
+        engine_type=engine_type,
+        model=model,
+    )
+
+    if isinstance(text, pd.DataFrame):
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+    else:
+        if compression_ratio == "auto":
+            compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
+
+        if compression_ratio > 1.0:
+            sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
+                                               with_scores=True)
+            text = " ".join([s[0] for s in sentences])
+
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+
+    return ner
+
+
+def get_ner_handler(
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
+    try:
+        get_nltk_tokenizer(language)
+    except LookupError:
+        language = "en"
+
+    return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
+        text,
+        language,
+        model,
+        engine_type,
+        fast,
+        compression_ratio,
+        with_comentions,
+        with_context,
+        entities,
+        score_threshold,
+        batch_size,
+        n_process
+    )
+
+
+@st.cache_resource
+def get_cached_ner_handler(language, model):
+    return get_ner_handler(language, model)
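A hedged sketch of driving the reworked Presidio-based pipeline end to end (the sample text, model name, and thresholds are illustrative; presidio-analyzer, flair, sumy, nltk, and the named spaCy model must be available):

```python
from streamlit_octostar_utils.nlp.ner import (
    build_presidio_analyzer, compute_ner_presidio, postprocess_ner, get_ner_handler
)

text = "Maria Rossi wired 500 EUR to ACME Corp in Berlin on 2024-01-05."

# spaCy engine: `model` is the spaCy model name handed to Presidio's NlpEngineProvider.
analyzer = build_presidio_analyzer("en", engine_type="spacy", model="en_core_web_sm")
raw = compute_ner_presidio(text, "en", analyzer, score_threshold=0.5)

# Deduplicate by name, keep only selected labels, and cap the list length.
top = postprocess_ner(raw, whitelisted_labels=["PERSON", "ORG", "LOC", "DATE"], max_entities=10)
print(top)

# Or build a reusable handler; it falls back to "en" when the language
# cannot be resolved to an NLTK tokenizer.
handler = get_ner_handler("en", "en_core_web_sm", engine_type="spacy", fast=True)
print(handler(text))
```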
streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/language.py (removed)

@@ -1,15 +0,0 @@
-import re
-import py3langid as langid
-import iso639 as languages
-
-
-def detect_language(text, min_confidence=None):
-    detector = langid.langid.LanguageIdentifier.from_pickled_model(
-        langid.langid.MODEL_FILE, norm_probs=True
-    )
-    detected_lang, confidence = detector.classify(text)
-    if min_confidence and confidence < min_confidence:
-        return None, confidence
-    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
-    detected_lang = languages.to_name(detected_lang).lower()
-    return detected_lang, confidence
streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/ner.py (removed)

@@ -1,329 +0,0 @@
-import re
-import streamlit as st
-from spacy_download import load_spacy
-from flair.data import Sentence
-from flair.models import SequenceTagger
-from sumy.parsers.plaintext import PlaintextParser
-from sumy.nlp.tokenizers import Tokenizer
-from sumy.nlp.stemmers import Stemmer
-from sumy.summarizers.lsa import LsaSummarizer
-from sumy.summarizers.luhn import LuhnSummarizer
-from sumy.utils import get_stop_words
-import itertools
-import numpy as np
-import math
-import nltk
-from typing import Optional, List
-from pydantic import BaseModel, ConfigDict, Field
-
-SPACY_NER_MODELS = {
-    "english": lambda: load_spacy(
-        "en_core_web_sm",
-        disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
-    )
-}
-FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
-REGEX_NER_MODELS = {
-    "IP_ADDRESS": [
-        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
-    ],
-    "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
-    "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
-    "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
-}
-
-BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
-
-
-def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
-    from operator import attrgetter
-    from sumy.summarizers._summarizer import SentenceInfo
-
-    rate = rating
-    if isinstance(rating, dict):
-        assert not args and not kwargs
-        rate = lambda s: rating[s]
-    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
-    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
-    return tuple((i.sentence, i.rating, i.order) for i in infos)
-
-
-def _sumy__lsa_call(summarizer, document):
-    summarizer._ensure_dependecies_installed()
-    dictionary = summarizer._create_dictionary(document)
-    if not dictionary:
-        return ()
-    matrix = summarizer._create_matrix(document, dictionary)
-    matrix = summarizer._compute_term_frequency(matrix)
-    from numpy.linalg import svd as singular_value_decomposition
-
-    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
-    ranks = iter(summarizer._compute_ranks(sigma, v))
-    return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
-
-
-def _sumy__luhn_call(summarizer, document):
-    words = summarizer._get_significant_words(document.words)
-    return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
-
-
-def get_nltk_tokenizer(language: str) -> Tokenizer:
-    nltk.data.find("tokenizers/punkt")
-    return Tokenizer(language)
-
-
-class NERObject(BaseModel):
-    name: str
-    label: str
-    score: float = 0.0
-    start: int
-    count: int
-    context: str | None = None
-    comentions: list[str] = Field(default_factory=list)
-    model_config = ConfigDict(extra="allow")
-
-    def __repr__(self):
-        return f"NERObject(label={self.label},name={self.name})"
-
-
-def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
-    if whitelisted_labels is not None:
-        entities = [e for e in entities if e.label in whitelisted_labels]
-    entities = sorted(entities, key=lambda x: x.name)
-    final_entities = []
-    for _, group in itertools.groupby(entities, key=lambda x: x.name):
-        group = list(group)
-        best_entity = max(group, key=lambda x: x.score * x.count)
-        merged_data = {
-            "name": best_entity.name,
-            "label": best_entity.label,
-            "score": best_entity.score,
-            "context": best_entity.context,
-            "count": sum(e.count for e in group),
-            "start": best_entity.start,
-        }
-        all_fields = best_entity.model_fields.keys()
-        for field in all_fields:
-            if field in merged_data:
-                continue
-            values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
-            if not values:
-                continue
-            if isinstance(values[0], list):
-                merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
-            else:
-                merged_data[field] = getattr(best_entity, field, None)
-        final_entities.append(NERObject(**merged_data))
-    final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
-    if max_entities and len(final_entities) > max_entities:
-        final_entities = final_entities[:max_entities]
-    return final_entities
-
-
-def compute_ner(
-    language,
-    sentences,
-    spacy_model,
-    flair_model=None,
-    context_width=150,
-    with_scores=True,
-    with_comentions=True,
-    with_context=True,
-):
-    sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
-    del sentence_starts[-1]
-    sentence_starts = list(np.cumsum(sentence_starts))
-    text = "\n".join([s[0] for s in sentences])
-    min_score = 1.0
-    entities: list[NERObject] = []
-
-    # FLAIR model (if not fast)
-    if flair_model:
-        input = [Sentence(sentence[0]) for sentence in sentences]
-        flair_model.predict(input)
-        output = [e for sentence in input for e in sentence.get_spans("ner")]
-        flair_entities = [
-            NERObject(
-                name=entity.text,
-                label=BASE_TO_ONTONOTES_LABELMAP.get(
-                    entity.annotation_layers["ner"][0].value,
-                    entity.annotation_layers["ner"][0].value,
-                ),
-                score=entity.score,
-                start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
-                count=1,
-            )
-            for entity in output
-        ]
-        min_score = min([min_score] + [e.score for e in flair_entities])
-        entities += flair_entities
-        del flair_entities
-
-    # REGEX model
-    for label, regexes in REGEX_NER_MODELS.items():
-        if not isinstance(regexes, list):
-            regexes = [regexes]
-        for regex in regexes:
-            regex_entities = [
-                NERObject(
-                    name=match.group(),
-                    label=label,
-                    score=min_score - 0.5,
-                    count=1,
-                    start=match.start(),
-                )
-                for match in re.finditer(regex, text)
-            ]
-            entities += regex_entities
-            min_score = min([min_score] + [e.score for e in regex_entities])
-
-    # SPACY model
-    chunks = []
-    chunk_start_offsets = []
-    current_chunk = []
-    current_length = 0
-    offset = 0
-    for sentence, _ in sentences:
-        sentence_len = len(sentence) + 1
-        if sentence_len > spacy_model.max_length:
-            truncated = sentence[: spacy_model.max_length - 1]
-            chunks.append(truncated)
-            chunk_start_offsets.append(offset)
-            offset += sentence_len
-            continue
-        if current_length + sentence_len > spacy_model.max_length:
-            chunks.append("\n".join(current_chunk))
-            chunk_start_offsets.append(offset - current_length)
-            current_chunk = []
-            current_length = 0
-        current_chunk.append(sentence)
-        current_length += sentence_len
-        offset += sentence_len
-    if current_chunk:
-        chunks.append("\n".join(current_chunk))
-        chunk_start_offsets.append(offset - current_length)
-    for i, chunk in enumerate(chunks):
-        doc = spacy_model(chunk)
-        chunk_offset = chunk_start_offsets[i]
-        for entity in doc.ents:
-            entities.append(
-                NERObject(
-                    name=entity.text,
-                    label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-                    score=min_score - 0.5,
-                    start=chunk_offset + entity.start_char,
-                    count=1,
-                )
-            )
-
-    # Reformatting for consistency
-    if not entities:
-        return []
-    if with_scores:
-        min_entity_score = min([e.score for e in entities])
-        max_entity_score = max([e.score for e in entities])
-        entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-        for e in entities:
-            e.score = (e.score - min_entity_score) / entity_score_range
-        scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
-        scores = [sentences[i - 1][1] for i in scores]
-        scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
-        for i in range(len(entities)):
-            entities[i].score = scores[i]
-    else:
-        for i in range(len(entities)):
-            entities[i].score = 0.0
-    if with_comentions:
-        for i in range(len(entities)):
-            entity = entities[i]
-            comentions = [
-                entities[j].name
-                for j in range(len(entities))
-                if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
-            ]
-            entities[i].comentions = comentions
-    if with_context:
-        for i in range(len(entities)):
-            entity = entities[i]
-            if entity.start >= 0 and entity.start < len(text):
-                left = max(0, entity.start - math.floor(context_width / 2))
-                right = min(len(text), entity.start + math.ceil(context_width / 2))
-                context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-                entities[i].context = context
-    return entities
-
-
-def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
-    tokenizer = get_nltk_tokenizer(language)
-    stemmer = Stemmer(language)
-    parser = PlaintextParser.from_string(text, tokenizer)
-    if fast:
-        summarizer = LuhnSummarizer(stemmer)
-        summarizer.stop_words = get_stop_words(language)
-        scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
-    else:
-        summarizer = LsaSummarizer(stemmer)
-        summarizer.stop_words = get_stop_words(language)
-        scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
-    summary = []
-    summary_chars = 0
-    summary_chars_penultimate = 0
-    while summary_chars < max_chars:
-        try:
-            next_sentence = next(scored_sentences)
-            summary.append(next_sentence)
-            summary_chars_penultimate = summary_chars
-            summary_chars += len(" " + next_sentence[0]._text)
-        except StopIteration:
-            break
-    summary = sorted(summary, key=lambda x: x[2])
-    summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
-    if summary_chars > max_chars:
-        summary[-1] = (
-            summary[-1][0][: max_chars - summary_chars_penultimate],
-            summary[-1][1],
-        )
-    if not with_scores:
-        summary = " ".join([s[0] for s in summary])
-    else:
-        min_score = min([s[1] for s in summary]) if summary else 0
-        max_score = max([min_score] + [s[1] for s in summary])
-        score_range = 1 if min_score == max_score else (max_score - min_score)
-        summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
-    return summary
-
-
-def ner_pipe(
-    text,
-    language,
-    spacy_model,
-    flair_model=None,
-    fast=False,
-    compression_ratio="auto",
-    with_scores=True,
-    with_comentions=True,
-    with_context=True,
-):
-    if compression_ratio == "auto":
-        compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
-    sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
-    ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
-    return ner
-
-
-def get_ner_handler(language, fast=False):
-    try:
-        get_nltk_tokenizer(language)  # raises a LookupError if the language is not valid
-    except LookupError:
-        language = "english"
-    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
-    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
-    return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
-        text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
-    )
-
-
-@st.cache_resource
-def get_cached_ner_handler(language, fast):
-    return get_ner_handler(language, fast)