streamlit-octostar-utils 0.2.9-py3-none-any.whl → 2.11a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- streamlit_octostar_utils/nlp/ner.py +424 -172
- {streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info}/METADATA +4 -2
- {streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info}/RECORD +5 -5
- {streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info}/WHEEL +1 -1
- {streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info/licenses}/LICENSE +0 -0
@@ -1,36 +1,142 @@
-import
+import itertools
+import math
+from typing import Optional, List, Tuple
+from pydantic import BaseModel, ConfigDict, Field
+from collections import Counter
+
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
+    EntityRecognizer, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
 import streamlit as st
-
+import nltk
+import pandas as pd
 from flair.data import Sentence
 from flair.models import SequenceTagger
+
 from sumy.parsers.plaintext import PlaintextParser
 from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
 from sumy.summarizers.lsa import LsaSummarizer
 from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
-import itertools
-import numpy as np
-import math
-import nltk
-from typing import Optional, List
-from pydantic import BaseModel, ConfigDict, Field
 
-
-
-
-
-
-
-
-
-"
-
-
-
-
-
-
+
+class FlairRecognizer(EntityRecognizer):
+    ENTITIES = [
+        "LOCATION",
+        "PERSON",
+        "ORGANIZATION",
+    ]
+
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+    CHECK_LABEL_GROUPS = [
+        ({"LOCATION"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORGANIZATION"}, {"ORG"}),
+    ]
+
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large",
+        "es": "flair/ner-spanish-large",
+        "de": "flair/ner-german-large",
+        "nl": "flair/ner-dutch-large",
+        "multi": "flair/ner-multi",
+        "multi-fast": "flair/ner-multi-fast",
+    }
+
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOCATION",
+        "ORG": "ORGANIZATION",
+    }
+
+    def __init__(
+        self,
+        model: SequenceTagger = None,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[set, set]] = None,
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        self.model = model
+
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+
+    def load(self) -> None:
+        pass
+
+    def get_supported_entities(self) -> List[str]:
+        return self.supported_entities
+
+    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
+        results = []
+
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+
+        if not entities:
+            entities = self.supported_entities
+
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                results.append(flair_result)
+
+        return results
+
+    def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+
+        return flair_results
+
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[set, set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
 BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
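The new FlairRecognizer implements Presidio's EntityRecognizer interface, so it can be registered like any other recognizer. A minimal usage sketch, not part of the diff, assuming the flair and presidio-analyzer packages are installed and Presidio's default spaCy NLP engine is available for tokenization:

    from flair.models import SequenceTagger
    from presidio_analyzer import AnalyzerEngine, RecognizerRegistry

    # Load the tagger named in MODEL_LANGUAGES (downloads the model on first use).
    tagger = SequenceTagger.load(FlairRecognizer.MODEL_LANGUAGES["en"])

    registry = RecognizerRegistry()
    registry.add_recognizer(FlairRecognizer(model=tagger, supported_language="en"))

    # Mirrors what the new build_presidio_analyzer() does for engine_type="flair".
    analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])
    print(analyzer.analyze(text="Ada Lovelace worked in London.", language="en"))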
@@ -69,11 +175,18 @@ def _sumy__luhn_call(summarizer, document):
 
 
 def get_nltk_tokenizer(language: str) -> Tokenizer:
+    if language == "en":
+        nltk_lang = "english"
+    elif language == "it":
+        nltk_lang = "italian"
+    else:
+        nltk_lang = language
+
     try:
         nltk.data.find("tokenizers/punkt")
     except LookupError:
         nltk.download("punkt")
-    return Tokenizer(
+    return Tokenizer(nltk_lang)
 
 
 class NERObject(BaseModel):
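The tokenizer change maps ISO codes to the language names NLTK and sumy expect, special-casing only "en" and "it"; any other code is passed to sumy's Tokenizer unchanged. A small illustrative call (hypothetical input, assuming the punkt data can be downloaded):

    tokenizer = get_nltk_tokenizer("it")  # resolves to NLTK's "italian" punkt rules
    print(tokenizer.to_sentences("Prima frase. Seconda frase."))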
@@ -124,137 +237,217 @@ def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_enti
     return final_entities
 
 
-def
-
-
-
-
-
-
-
-
+def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
+    registry = RecognizerRegistry()
+
+    if engine_type == "flair":
+
+        flair_recognizer = FlairRecognizer(
+            model=model,
+            supported_language=language
+        )
+        registry.add_recognizer(flair_recognizer)
+
+        default_registry = RecognizerRegistry()
+        default_registry.load_predefined_recognizers()
+
+        flair_handled_entities = {"PERSON", "LOCATION", "ORGANIZATION"}
+
+        for recognizer in default_registry.recognizers:
+            recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
+
+            if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
+                continue
+
+            registry.add_recognizer(recognizer)
+
+        return AnalyzerEngine(
+            registry=registry,
+            supported_languages=[language]
+        )
+
+    else:
+        registry.load_predefined_recognizers()
+
+        if model is None:
+            raise ValueError("SpaCy model name must be provided")
+
+        configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": language, "model_name": model}],
+        }
+
+        provider = NlpEngineProvider(nlp_configuration=configuration)
+        nlp_engine = provider.create_engine()
+
+        return AnalyzerEngine(
+            nlp_engine=nlp_engine,
+            registry=registry,
+            supported_languages=[language],
+        )
+
+
+def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, language: str,
+                          entities: Optional[List[str]], score_threshold: float) -> Optional[str]:
+    sample_values = column_values.dropna().head(50)
+
+    if sample_values.empty:
+        return None
+
+    entity_counter = Counter()
+
+    for value in sample_values:
+        text = str(value).strip()
+
+        if not text:
+            continue
+
+        results = analyzer.analyze(
+            text=text,
+            language=language,
+            entities=entities if entities else None
+        )
+
+        for result in results:
+            if result.score >= score_threshold:
+                entity_counter[result.entity_type] += 1
+
+    if not entity_counter:
+        return None
+
+    most_common = entity_counter.most_common(1)[0]
+    total_detections = sum(entity_counter.values())
+
+    if most_common[1] > total_detections * 0.5:
+        return most_common[0]
+
+    return most_common[0] if entity_counter else None
+
+
+def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
+                                entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
+    ner_objects = []
+
+    for column_name in df.columns:
+        entity_type = analyze_column_sample(
+            df[column_name],
+            analyzer,
+            language,
+            entities,
+            score_threshold
+        )
+
+        if entity_type:
+            for idx, value in df[column_name].dropna().items():
+                text = str(value).strip()
+
+                if text:
+                    ner_objects.append(NERObject(
+                        name=text[:100],
+                        label=entity_type,
+                        score=0.9,
+                        start=0,
+                        count=1,
+                        context=text[:100]
+                    ))
+
+    return ner_objects
+
+
+def compute_ner_presidio(
+    text,
+    language,
+    analyzer,
+    entities=None,
+    score_threshold=0.5,
+    context_width=150,
+    with_comentions=True,
+    with_context=True,
+    batch_size=32,
+    n_process=4
 ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    if isinstance(text, pd.DataFrame):
+        if len(text) >= 100:
+            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+
+        else:
+            texts = []
+
+            for col in text.columns:
+                for idx, value in text[col].dropna().items():
+                    text_value = str(value).strip()
+
+                    if text_value:
+                        texts.append(text_value)
+
+            text = "\n".join(texts)
+
+    elif isinstance(text, list):
+        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+        results_generator = batch_analyzer.analyze_iterator(
+            texts=text,
+            language=language,
+            batch_size=batch_size,
+            n_process=n_process,
+            entities=entities if entities else None,
+        )
+
+        all_results = list(results_generator)
+        ner_objects = []
+
+        for text_item, results in zip(text, all_results):
+            for result in results:
+                if result.score >= score_threshold:
+                    context_start = max(0, result.start - 30)
+                    context_end = min(len(text_item), result.end + 30)
+                    context = text_item[context_start:context_end] if with_context else None
+
+                    ner_objects.append(NERObject(
+                        name=text_item[result.start:result.end],
+                        label=result.entity_type,
+                        score=float(result.score),
+                        start=int(result.start),
+                        count=1,
+                        context=context
+                    ))
+
+        return ner_objects
+
+    results = analyzer.analyze(
+        text=text,
+        language=language,
+        entities=entities if entities else None
+    )
+
+    ner_objects = []
+
+    for result in results:
+        if result.score >= score_threshold:
+            context_start = max(0, result.start - math.floor(context_width / 2))
+            context_end = min(len(text), result.end + math.ceil(context_width / 2))
+            context = text[context_start:context_end] if with_context else None
+
+            ner_objects.append(NERObject(
+                name=text[result.start:result.end],
+                label=result.entity_type,
+                score=float(result.score),
+                start=int(result.start),
                 count=1,
-
-
-
-    min_score = min([min_score] + [e.score for e in flair_entities])
-    entities += flair_entities
-    del flair_entities
-
-    # REGEX model
-    for label, regexes in REGEX_NER_MODELS.items():
-        if not isinstance(regexes, list):
-            regexes = [regexes]
-        for regex in regexes:
-            regex_entities = [
-                NERObject(
-                    name=match.group(),
-                    label=label,
-                    score=min_score - 0.5,
-                    count=1,
-                    start=match.start(),
-                )
-                for match in re.finditer(regex, text)
-            ]
-            entities += regex_entities
-            min_score = min([min_score] + [e.score for e in regex_entities])
-
-    # SPACY model
-    chunks = []
-    chunk_start_offsets = []
-    current_chunk = []
-    current_length = 0
-    offset = 0
-    for sentence, _ in sentences:
-        sentence_len = len(sentence) + 1
-        if sentence_len > spacy_model.max_length:
-            truncated = sentence[: spacy_model.max_length - 1]
-            chunks.append(truncated)
-            chunk_start_offsets.append(offset)
-            offset += sentence_len
-            continue
-        if current_length + sentence_len > spacy_model.max_length:
-            chunks.append("\n".join(current_chunk))
-            chunk_start_offsets.append(offset - current_length)
-            current_chunk = []
-            current_length = 0
-        current_chunk.append(sentence)
-        current_length += sentence_len
-        offset += sentence_len
-    if current_chunk:
-        chunks.append("\n".join(current_chunk))
-        chunk_start_offsets.append(offset - current_length)
-    for i, chunk in enumerate(chunks):
-        doc = spacy_model(chunk)
-        chunk_offset = chunk_start_offsets[i]
-        for entity in doc.ents:
-            entities.append(
-                NERObject(
-                    name=entity.text,
-                    label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-                    score=min_score - 0.5,
-                    start=chunk_offset + entity.start_char,
-                    count=1,
-                )
-            )
-
-    # Reformatting for consistency
-    if not entities:
-        return []
-    if with_scores:
-        min_entity_score = min([e.score for e in entities])
-        max_entity_score = max([e.score for e in entities])
-        entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-        for e in entities:
-            e.score = (e.score - min_entity_score) / entity_score_range
-        scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
-        scores = [sentences[i - 1][1] for i in scores]
-        scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
-        for i in range(len(entities)):
-            entities[i].score = scores[i]
-    else:
-        for i in range(len(entities)):
-            entities[i].score = 0.0
+                context=context
+            ))
+
     if with_comentions:
-        for i in range(len(
-            entity =
+        for i in range(len(ner_objects)):
+            entity = ner_objects[i]
             comentions = [
-
-                for j in range(len(
-                if j != i and abs(
+                ner_objects[j].name
+                for j in range(len(ner_objects))
+                if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
             ]
-
-
-
-        entity = entities[i]
-        if entity.start >= 0 and entity.start < len(text):
-            left = max(0, entity.start - math.floor(context_width / 2))
-            right = min(len(text), entity.start + math.ceil(context_width / 2))
-            context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-            entities[i].context = context
-    return entities
+            ner_objects[i].comentions = comentions
+
+    return ner_objects
 
 
 def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
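The rewritten compute_ner_presidio() dispatches on input type: DataFrames with at least 100 rows go through per-column sampling (analyze_dataframe_optimized), smaller DataFrames are flattened to a single text, lists go through Presidio's BatchAnalyzerEngine, and plain strings use a single analyze() pass. A hedged sketch of the three call shapes; the spaCy model name "en_core_web_lg" is an assumption, not taken from the diff:

    import pandas as pd

    analyzer = build_presidio_analyzer(language="en", model="en_core_web_lg")

    # Plain string: one analyzer.analyze() pass, context_width-sized windows
    compute_ner_presidio("Jane Doe lives in Paris.", "en", analyzer)

    # List of strings: BatchAnalyzerEngine honours batch_size / n_process
    compute_ner_presidio(["Jane Doe", "john@example.com"], "en", analyzer)

    # Large DataFrame (>= 100 rows): per-column entity-type sampling
    compute_ner_presidio(pd.DataFrame({"email": ["john@example.com"] * 100}), "en", analyzer)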
@@ -298,35 +491,94 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
 
 
 def ner_pipe(
-
-
-
-
-
-
-
-
-
+    text,
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    compression_ratio="auto",
+    with_comentions=True,
+    with_context=True,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
 ):
-
-
-
-
+    analyzer = build_presidio_analyzer(
+        language=language,
+        engine_type=engine_type,
+        model=model,
+    )
+
+    if isinstance(text, pd.DataFrame):
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            150,
+            with_comentions,
+            with_context,
+            batch_size,
+            n_process
+        )
+    else:
+        if compression_ratio == "auto":
+            compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
+
+        if compression_ratio > 1.0:
+            sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
+                                               with_scores=True)
+            text = " ".join([s[0] for s in sentences])
+
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            150,
+            with_comentions,
+            with_context,
+            batch_size,
+            n_process
+        )
+
     return ner
 
 
-def get_ner_handler(
+def get_ner_handler(
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
     try:
-        get_nltk_tokenizer(language)
+        get_nltk_tokenizer(language)
     except LookupError:
-        language = "
-
-
-
-
+        language = "en"
+
+    return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
+        text,
+        language,
+        model,
+        engine_type,
+        fast,
+        compression_ratio,
+        with_comentions,
+        with_context,
+        entities,
+        score_threshold,
+        batch_size,
+        n_process
     )
 
 
 @st.cache_resource
-def get_cached_ner_handler(language,
-    return get_ner_handler(language,
+def get_cached_ner_handler(language, model):
+    return get_ner_handler(language, model)
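get_ner_handler() now returns a closure over the configured pipeline, and get_cached_ner_handler() memoizes it per (language, model) pair via @st.cache_resource. A hypothetical Streamlit snippet; the model name is again an assumption:

    import streamlit as st
    from streamlit_octostar_utils.nlp.ner import get_cached_ner_handler

    ner = get_cached_ner_handler("en", "en_core_web_lg")  # assumed spaCy model; built once per cache
    entities = ner(st.text_area("Text to analyze", "Jane Doe lives in Paris."))
    st.write([(e.name, e.label, e.score) for e in entities])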
{streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info}/METADATA
RENAMED
@@ -1,8 +1,9 @@
-Metadata-Version: 2.
+Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version:
+Version: 2.11a1
 Summary:
 License: MIT
+License-File: LICENSE
 Author: Octostar
 Requires-Python: >=3.9, !=2.7.*, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*, !=3.7.*, !=3.8.*
 Classifier: License :: OSI Approved :: MIT License
@@ -12,6 +13,7 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Programming Language :: Python :: 3.11
 Classifier: Programming Language :: Python :: 3.12
 Classifier: Programming Language :: Python :: 3.13
+Classifier: Programming Language :: Python :: 3.14
 Provides-Extra: nlp
 Requires-Dist: PyJWT (>=2.5.0,<3.0.0)
 Requires-Dist: celery (>=5.3.0,<6.0.0)
{streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info}/RECORD
RENAMED
@@ -21,7 +21,7 @@ streamlit_octostar_utils/core/threading/key_queue.py,sha256=7CJpj0gvZMQd8eC5wKQi
 streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_yN1OByl8ahLc8,383
 streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
 streamlit_octostar_utils/nlp/language.py,sha256=BBBT8wtwWtVrCin5fNLMqGg5WdgHVotFkIvouk2qKh0,561
-streamlit_octostar_utils/nlp/ner.py,sha256=
+streamlit_octostar_utils/nlp/ner.py,sha256=ZKYVG33uoCupr-WmberQ0856cC1Fu_W5Da2NdeYtlBw,18561
 streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/octostar/client.py,sha256=NUvHe9asd65g4-hJ4CuUvUns-9dNWes1XZRJlO9eAAc,1690
 streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -36,7 +36,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
 streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
 streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
 streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
-streamlit_octostar_utils-
-streamlit_octostar_utils-
-streamlit_octostar_utils-
-streamlit_octostar_utils-
+streamlit_octostar_utils-2.11a1.dist-info/METADATA,sha256=HsaC9ySXFVacqX0l_i255QiqYKscq8b_0Edyp960xho,2330
+streamlit_octostar_utils-2.11a1.dist-info/WHEEL,sha256=M5asmiAlL6HEcOq52Yi5mmk9KmTVjY2RDPtO4p9DMrc,88
+streamlit_octostar_utils-2.11a1.dist-info/licenses/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+streamlit_octostar_utils-2.11a1.dist-info/RECORD,,
{streamlit_octostar_utils-0.2.9.dist-info → streamlit_octostar_utils-2.11a1.dist-info/licenses}/LICENSE
RENAMED
File without changes