streamlit-octostar-utils 0.2.10__tar.gz → 2.11a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/fastapi.py +1 -1
  4. streamlit_octostar_utils-2.11a1/streamlit_octostar_utils/nlp/ner.py +584 -0
  5. streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/ner.py +0 -329
  6. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/LICENSE +0 -0
  7. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/README.md +0 -0
  8. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/__init__.py +0 -0
  9. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  10. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  11. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  12. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  13. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  14. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  15. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  16. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  17. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  18. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  19. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  20. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  21. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  22. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/__init__.py +0 -0
  23. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/dict.py +0 -0
  24. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/filetypes.py +0 -0
  25. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  26. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  27. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/core/timestamp.py +0 -0
  28. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  29. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/nlp/language.py +0 -0
  30. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  31. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/octostar/client.py +0 -0
  32. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/octostar/context.py +0 -0
  33. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  34. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  35. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  36. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  37. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/ontology/validation.py +0 -0
  38. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/style/__init__.py +0 -0
  39. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/style/common.py +0 -0
  40. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/threading/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  42. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  43. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a1}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.2.10
3
+ Version: 2.11a1
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.2.10"
8
+ version = "2.11a1"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -330,7 +330,7 @@ class DefaultErrorRoute:
330
330
  if len(message) > MAX_ERROR_MESSAGE_BYTES:
331
331
  message = message[-MAX_ERROR_MESSAGE_BYTES:]
332
332
  try:
333
- tcbk = "\n".join(traceback.format_exception(exc))
333
+ tcbk = traceback.format_exception(exc)
334
334
  if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
335
335
  tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
336
336
  except:
@@ -0,0 +1,584 @@
1
+ import itertools
2
+ import math
3
+ from typing import Optional, List, Tuple
4
+ from pydantic import BaseModel, ConfigDict, Field
5
+ from collections import Counter
6
+
7
+ from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
8
+ EntityRecognizer, RecognizerResult
9
+ from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
10
+ import streamlit as st
11
+ import nltk
12
+ import pandas as pd
13
+ from flair.data import Sentence
14
+ from flair.models import SequenceTagger
15
+
16
+ from sumy.parsers.plaintext import PlaintextParser
17
+ from sumy.nlp.tokenizers import Tokenizer
18
+ from sumy.nlp.stemmers import Stemmer
19
+ from sumy.summarizers.lsa import LsaSummarizer
20
+ from sumy.summarizers.luhn import LuhnSummarizer
21
+ from sumy.utils import get_stop_words
22
+
23
+
24
class FlairRecognizer(EntityRecognizer):
    """Presidio ``EntityRecognizer`` that delegates NER to a Flair ``SequenceTagger``.

    The tagger is supplied by the caller (``load`` is a no-op), so the same
    loaded model can be shared across recognizers. Flair labels are matched
    against the requested Presidio entity names through ``check_label_groups``
    and renamed through ``PRESIDIO_EQUIVALENCES``.
    """

    # Presidio entity names this recognizer reports by default.
    ENTITIES = [
        "LOCATION",
        "PERSON",
        "ORGANIZATION",
    ]

    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"

    # Pairs of (Presidio entity names, Flair labels accepted for them).
    CHECK_LABEL_GROUPS = [
        ({"LOCATION"}, {"LOC", "LOCATION"}),
        ({"PERSON"}, {"PER", "PERSON"}),
        ({"ORGANIZATION"}, {"ORG"}),
    ]

    # Language code -> Flair model identifier (not consumed inside this class).
    MODEL_LANGUAGES = {
        "en": "flair/ner-english-large",
        "es": "flair/ner-spanish-large",
        "de": "flair/ner-german-large",
        "nl": "flair/ner-dutch-large",
        "multi": "flair/ner-multi",
        "multi-fast": "flair/ner-multi-fast",
    }

    # Flair tag -> Presidio entity type used in the emitted results.
    PRESIDIO_EQUIVALENCES = {
        "PER": "PERSON",
        "LOC": "LOCATION",
        "ORG": "ORGANIZATION",
    }

    def __init__(
        self,
        model: SequenceTagger = None,
        supported_language: str = "en",
        supported_entities: Optional[List[str]] = None,
        check_label_groups: Optional[List[Tuple[set, set]]] = None,
    ):
        """Store the tagger and label mapping, then initialise the base recognizer.

        Args:
            model: Loaded Flair tagger used by ``analyze``.
            supported_language: Language code this recognizer is registered for.
            supported_entities: Entity names to report; defaults to ``ENTITIES``.
            check_label_groups: List of ``(entity names, Flair labels)`` pairs;
                defaults to ``CHECK_LABEL_GROUPS``.
        """
        # Attributes are set before super().__init__ so they already exist if
        # the base class invokes load() during construction.
        self.check_label_groups = (
            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
        )

        supported_entities = supported_entities if supported_entities else self.ENTITIES
        self.model = model

        super().__init__(
            supported_entities=supported_entities,
            supported_language=supported_language,
            name="Flair Analytics",
        )

    def load(self) -> None:
        """No-op: the Flair model is injected through the constructor."""
        pass

    def get_supported_entities(self) -> List[str]:
        """Return the entity names this recognizer can report."""
        return self.supported_entities

    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
        """Run the Flair tagger on ``text`` and return matching results.

        Args:
            text: Text to analyse.
            entities: Requested entity names; when empty, all supported
                entities are used. Unsupported names are ignored.
            nlp_artifacts: Unused; accepted for interface compatibility.

        Returns:
            One ``RecognizerResult`` per (requested entity, matching span) pair.

        Raises:
            ValueError: If the recognizer was built without a model.
        """
        if self.model is None:
            # Fail with an actionable message instead of an AttributeError on None.
            raise ValueError(
                "FlairRecognizer has no model; pass a loaded flair SequenceTagger as `model`."
            )

        sentence = Sentence(text)
        self.model.predict(sentence)

        if not entities:
            entities = self.supported_entities

        # The predicted spans do not depend on the requested entity: read them once.
        spans = sentence.get_spans("ner")

        results = []
        for entity in entities:
            if entity not in self.supported_entities:
                continue

            for span in spans:
                label = span.labels[0].value
                if not self.__check_label(entity, label, self.check_label_groups):
                    continue
                textual_explanation = self.DEFAULT_EXPLANATION.format(label)
                explanation = self.build_flair_explanation(
                    round(span.score, 2), textual_explanation
                )
                results.append(self._convert_to_recognizer_result(span, explanation))

        return results

    def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
        """Wrap a score and message into an ``AnalysisExplanation``."""
        return AnalysisExplanation(
            recognizer=self.__class__.__name__,
            original_score=original_score,
            textual_explanation=explanation,
        )

    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
        """Build a ``RecognizerResult`` from a Flair span.

        The span tag is translated through ``PRESIDIO_EQUIVALENCES`` (unknown
        tags pass through unchanged) and the score is rounded to two decimals.
        """
        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
        flair_score = round(entity.score, 2)

        return RecognizerResult(
            entity_type=entity_type,
            start=entity.start_position,
            end=entity.end_position,
            score=flair_score,
            analysis_explanation=explanation,
        )

    @staticmethod
    def __check_label(
        entity: str, label: str, check_label_groups: List[Tuple[set, set]]
    ) -> bool:
        """Return True if some group contains both ``entity`` and ``label``."""
        return any(
            entity in egrp and label in lgrp for egrp, lgrp in check_label_groups
        )
139
+
140
+
141
+ BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
142
+ BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
143
+
144
+
145
def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
    """Rank all sentences by rating, best first.

    ``rating`` is either a dict keyed by sentence or a callable invoked as
    ``rating(sentence, *args, **kwargs)``. Returns a tuple of
    ``(sentence, rating, original_position)`` triples sorted by descending
    rating.
    """
    from operator import attrgetter
    from sumy.summarizers._summarizer import SentenceInfo

    if isinstance(rating, dict):
        # A lookup table takes no extra arguments.
        assert not args and not kwargs

        def score(sentence):
            return rating[sentence]
    else:
        score = rating

    ranked = [
        SentenceInfo(sentence, position, score(sentence, *args, **kwargs))
        for position, sentence in enumerate(sentences)
    ]
    ranked.sort(key=attrgetter("rating"), reverse=True)
    return tuple((info.sentence, info.rating, info.order) for info in ranked)
156
+
157
+
158
def _sumy__lsa_call(summarizer, document):
    """Score every sentence of ``document`` with the LSA summarizer's internals.

    Returns the ranked ``(sentence, rating, order)`` triples produced by
    ``_sumy__get_best_sentences``, or an empty tuple when the summarizer
    builds an empty dictionary for the document.
    """
    summarizer._ensure_dependecies_installed()
    vocabulary = summarizer._create_dictionary(document)
    if not vocabulary:
        return ()

    term_matrix = summarizer._compute_term_frequency(
        summarizer._create_matrix(document, vocabulary)
    )
    # Imported only after the summarizer's dependency check has passed.
    from numpy.linalg import svd as singular_value_decomposition

    _, sigma, v = singular_value_decomposition(term_matrix, full_matrices=False)
    rank_stream = iter(summarizer._compute_ranks(sigma, v))

    def next_rank(_sentence):
        # Ranks are consumed positionally, one per sentence in document order.
        return next(rank_stream)

    return _sumy__get_best_sentences(document.sentences, next_rank)
170
+
171
+
172
def _sumy__luhn_call(summarizer, document):
    """Score every sentence of ``document`` with the Luhn summarizer's rater.

    Returns the ranked ``(sentence, rating, order)`` triples produced by
    ``_sumy__get_best_sentences``.
    """
    significant_words = summarizer._get_significant_words(document.words)
    return _sumy__get_best_sentences(
        document.sentences,
        summarizer.rate_sentence,
        significant_words,
    )
175
+
176
+
177
def get_nltk_tokenizer(language: str) -> Tokenizer:
    """Return a sumy ``Tokenizer`` for ``language``.

    ``"en"`` and ``"it"`` are translated to ``"english"`` / ``"italian"``;
    any other value is passed to ``Tokenizer`` unchanged. The NLTK ``punkt``
    resource is downloaded if it cannot be found locally.
    """
    nltk_lang = {"en": "english", "it": "italian"}.get(language, language)

    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

    return Tokenizer(nltk_lang)
190
+
191
+
192
class NERObject(BaseModel):
    """A single named-entity mention (or a merged group of mentions)."""

    # Extra keyword arguments are accepted and kept on the instance.
    model_config = ConfigDict(extra="allow")

    name: str  # entity surface text
    label: str  # entity type
    score: float = 0.0
    start: int  # character offset of the mention
    count: int  # number of occurrences represented
    context: Optional[str] = None  # surrounding text snippet, if any
    comentions: list[str] = Field(default_factory=list)  # names of nearby entities

    def __repr__(self):
        return f"NERObject(label={self.label},name={self.name})"
204
+
205
+
206
def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
    """Filter, merge and rank NER results.

    Entities sharing the same ``name`` are merged into one ``NERObject``:
    the member with the highest ``score * count`` provides ``name``,
    ``label``, ``score``, ``context`` and ``start``; ``count`` is the sum over
    the group; list-valued declared fields (e.g. ``comentions``) are
    concatenated and de-duplicated in first-seen order; other declared fields
    are taken from the best member.

    Args:
        entities: Entities to post-process.
        whitelisted_labels: If not ``None``, only entities whose ``label`` is
            in this collection are kept.
        max_entities: If truthy, at most this many entities are returned.

    Returns:
        Merged entities sorted by descending ``score * count``.
    """
    if whitelisted_labels is not None:
        entities = [e for e in entities if e.label in whitelisted_labels]

    def weight(entity):
        return entity.score * entity.count

    # groupby only merges adjacent items, so sort by the grouping key first.
    entities = sorted(entities, key=lambda e: e.name)
    final_entities = []
    for _, group in itertools.groupby(entities, key=lambda e: e.name):
        group = list(group)
        best_entity = max(group, key=weight)
        merged_data = {
            "name": best_entity.name,
            "label": best_entity.label,
            "score": best_entity.score,
            "context": best_entity.context,
            "count": sum(e.count for e in group),
            "start": best_entity.start,
        }
        # Read the declared fields from the class: instance access to
        # `model_fields` is deprecated in recent Pydantic releases.
        for field in type(best_entity).model_fields:
            if field in merged_data:
                continue
            values = [
                value
                for value in (getattr(e, field, None) for e in group)
                if value is not None
            ]
            if not values:
                continue
            if isinstance(values[0], list):
                # dict.fromkeys de-duplicates while keeping a deterministic order
                # (a plain set would shuffle the merged list between runs).
                merged_data[field] = list(dict.fromkeys(itertools.chain.from_iterable(values)))
            else:
                merged_data[field] = getattr(best_entity, field, None)
        final_entities.append(NERObject(**merged_data))

    final_entities.sort(key=weight, reverse=True)
    if max_entities and len(final_entities) > max_entities:
        final_entities = final_entities[:max_entities]
    return final_entities
238
+
239
+
240
def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
    """Build a Presidio ``AnalyzerEngine`` for a single language.

    Args:
        language: Language code the analyzer will serve.
        engine_type: ``"flair"`` to use ``FlairRecognizer`` alongside the
            predefined recognizers; any other value selects the spaCy engine.
        model: For ``"flair"``, the loaded Flair tagger; otherwise the spaCy
            model name.

    Returns:
        The configured ``AnalyzerEngine``.

    Raises:
        ValueError: If ``model`` is ``None``.
    """
    if engine_type == "flair":
        if model is None:
            # Without a tagger the recognizer would only fail later, inside analyze().
            raise ValueError("Flair model must be provided")

        registry = RecognizerRegistry()
        registry.add_recognizer(FlairRecognizer(model=model, supported_language=language))

        # Load the predefined recognizers for the requested language; the
        # library default registers them for English only, which leaves them
        # unused for any other language.
        # NOTE(review): recent Presidio releases may additionally require the
        # registry's own supported languages to match the engine's - confirm
        # against the pinned presidio-analyzer version.
        default_registry = RecognizerRegistry()
        default_registry.load_predefined_recognizers(languages=[language])

        flair_handled_entities = {"PERSON", "LOCATION", "ORGANIZATION"}

        for recognizer in default_registry.recognizers:
            recognizer_entities = set(getattr(recognizer, "supported_entities", ()))

            # Skip recognizers whose whole output is already covered by Flair.
            if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
                continue

            registry.add_recognizer(recognizer)

        return AnalyzerEngine(
            registry=registry,
            supported_languages=[language],
        )

    # Validate before doing the comparatively expensive registry load.
    if model is None:
        raise ValueError("SpaCy model name must be provided")

    registry = RecognizerRegistry()
    registry.load_predefined_recognizers(languages=[language])

    configuration = {
        "nlp_engine_name": "spacy",
        "models": [{"lang_code": language, "model_name": model}],
    }
    nlp_engine = NlpEngineProvider(nlp_configuration=configuration).create_engine()

    return AnalyzerEngine(
        nlp_engine=nlp_engine,
        registry=registry,
        supported_languages=[language],
    )
288
+
289
+
290
def analyze_column_sample(column_values: pd.Series, analyzer: "AnalyzerEngine", language: str,
                          entities: Optional[List[str]], score_threshold: float,
                          sample_size: int = 50, min_share: float = 0.0) -> Optional[str]:
    """Guess the dominant entity type of a column from a sample of its values.

    The first ``sample_size`` non-null values are analysed one by one; every
    detection whose score is at least ``score_threshold`` counts as one vote
    for its entity type.

    Args:
        column_values: Column to inspect.
        analyzer: Object exposing ``analyze(text=..., language=..., entities=...)``.
        language: Language code forwarded to the analyzer.
        entities: Entity types to look for; empty/``None`` means no restriction.
        score_threshold: Minimum score for a detection to be counted.
        sample_size: Number of leading non-null values to analyse.
        min_share: The winning type is returned only if its votes exceed
            ``min_share`` times the total votes. The default ``0.0`` always
            returns the most frequent type; ``0.5`` requires a strict majority.

    Returns:
        The most frequent entity type, or ``None`` when nothing was detected
        or the winner does not reach ``min_share``.
    """
    sample_values = column_values.dropna().head(sample_size)
    if sample_values.empty:
        return None

    requested = entities if entities else None
    entity_counter = Counter()

    for value in sample_values:
        text = str(value).strip()
        if not text:
            continue

        entity_counter.update(
            result.entity_type
            for result in analyzer.analyze(text=text, language=language, entities=requested)
            if result.score >= score_threshold
        )

    if not entity_counter:
        return None

    top_type, top_votes = entity_counter.most_common(1)[0]
    if top_votes <= sum(entity_counter.values()) * min_share:
        return None
    return top_type
325
+
326
+
327
def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
                                entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
    """Label whole columns instead of analysing every cell.

    Each column is classified once via ``analyze_column_sample``. When a type
    is found, every non-null, non-blank cell of that column becomes an
    ``NERObject`` with that label, a fixed score of 0.9, and its text (cut to
    100 characters) as both name and context.
    """
    collected = []

    for column_name in df.columns:
        detected_label = analyze_column_sample(
            df[column_name], analyzer, language, entities, score_threshold
        )
        if not detected_label:
            continue

        for _, cell in df[column_name].dropna().items():
            cell_text = str(cell).strip()
            if not cell_text:
                continue

            snippet = cell_text[:100]
            collected.append(
                NERObject(
                    name=snippet,
                    label=detected_label,
                    score=0.9,
                    start=0,
                    count=1,
                    context=snippet,
                )
            )

    return collected
355
+
356
+
357
def _presidio_hits_to_ner_objects(source_text, hits, score_threshold, before, after, with_context):
    """Turn analyzer results scoring at least the threshold into NERObject items."""
    converted = []
    for hit in hits:
        if hit.score >= score_threshold:
            window_start = max(0, hit.start - before)
            window_end = min(len(source_text), hit.end + after)
            converted.append(
                NERObject(
                    name=source_text[hit.start:hit.end],
                    label=hit.entity_type,
                    score=float(hit.score),
                    start=int(hit.start),
                    count=1,
                    context=source_text[window_start:window_end] if with_context else None,
                )
            )
    return converted


def compute_ner_presidio(
    text,
    language,
    analyzer,
    entities=None,
    score_threshold=0.5,
    context_width=150,
    with_comentions=True,
    with_context=True,
    batch_size=32,
    n_process=4,
):
    """Extract entities from a string, a list of strings, or a DataFrame.

    * DataFrame with 100+ rows: delegated to ``analyze_dataframe_optimized``.
    * Smaller DataFrame: non-blank cells are joined with newlines and handled
      as a single string.
    * list: analysed through ``BatchAnalyzerEngine``; each hit gets a fixed
      30-character context margin on both sides and no comentions.
    * str: analysed directly; context spans ``context_width`` characters
      around the hit, and comentions are the names of other hits whose start
      lies within half of ``context_width``.

    Returns a list of ``NERObject``.
    """
    requested = entities if entities else None

    if isinstance(text, pd.DataFrame):
        if len(text) >= 100:
            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)

        cells = []
        for col in text.columns:
            for _, value in text[col].dropna().items():
                cell = str(value).strip()
                if cell:
                    cells.append(cell)
        text = "\n".join(cells)

    elif isinstance(text, list):
        batch_engine = BatchAnalyzerEngine(analyzer_engine=analyzer)
        per_item_hits = list(
            batch_engine.analyze_iterator(
                texts=text,
                language=language,
                batch_size=batch_size,
                n_process=n_process,
                entities=requested,
            )
        )

        found = []
        for item, hits in zip(text, per_item_hits):
            # NOTE(review): list mode uses a fixed margin and ignores context_width.
            found.extend(
                _presidio_hits_to_ner_objects(item, hits, score_threshold, 30, 30, with_context)
            )
        return found

    hits = analyzer.analyze(text=text, language=language, entities=requested)

    margin_before = math.floor(context_width / 2)
    margin_after = math.ceil(context_width / 2)
    found = _presidio_hits_to_ner_objects(
        text, hits, score_threshold, margin_before, margin_after, with_context
    )

    if with_comentions:
        for i, anchor in enumerate(found):
            anchor.comentions = [
                other.name
                for j, other in enumerate(found)
                if j != i and abs(other.start - anchor.start) < margin_after
            ]

    return found
451
+
452
+
453
def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
    """Build an extractive summary of ``text`` limited to ``max_chars`` characters.

    Sentences are taken in descending rating order (Luhn when ``fast`` is
    true, LSA otherwise) until the character budget is used up; each sentence
    costs its length plus one separator. The sentence that overflows the
    budget is cut to the space that was left. The selection is then restored
    to document order.

    Args:
        text: Text to summarise.
        language: Language passed to the tokenizer, stemmer and stop-word list.
        max_chars: Character budget of the summary.
        fast: Use the Luhn summarizer instead of LSA.
        with_scores: Return scored sentences instead of a single string.

    Returns:
        The selected sentences joined with spaces, or - when ``with_scores``
        is true - a list of ``(sentence_text, score)`` pairs with scores
        min-max normalised to ``[0, 1]``.
    """
    tokenizer = get_nltk_tokenizer(language)
    stemmer = Stemmer(language)
    parser = PlaintextParser.from_string(text, tokenizer)
    if fast:
        summarizer = LuhnSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        ranked_sentences = _sumy__luhn_call(summarizer, parser.document)
    else:
        summarizer = LsaSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        ranked_sentences = _sumy__lsa_call(summarizer, parser.document)

    picked = []  # (document order, text, rating)
    used_chars = 0
    for sentence, rating, order in ranked_sentences:
        if used_chars >= max_chars:
            break
        sentence_text = sentence._text
        room_left = max_chars - used_chars
        used_chars += len(sentence_text) + 1
        if used_chars > max_chars:
            # Cut the overflowing sentence now, while we still know which one
            # it is: after re-sorting by document order it is not necessarily
            # the last element any more.
            sentence_text = sentence_text[:room_left]
        picked.append((order, sentence_text, rating))

    picked.sort(key=lambda item: item[0])
    summary = [(sentence_text, rating) for _, sentence_text, rating in picked]

    if not with_scores:
        return " ".join(sentence_text for sentence_text, _ in summary)

    ratings = [rating for _, rating in summary]
    min_score = min(ratings) if ratings else 0
    max_score = max([min_score] + ratings)
    # Avoid dividing by zero when every sentence has the same rating.
    score_range = 1 if min_score == max_score else (max_score - min_score)
    return [(sentence_text, (rating - min_score) / score_range) for sentence_text, rating in summary]
491
+
492
+
493
def ner_pipe(
    text,
    language,
    model,
    engine_type="spacy",
    fast=False,
    compression_ratio="auto",
    with_comentions=True,
    with_context=True,
    entities=None,
    score_threshold=0.5,
    batch_size=32,
    n_process=4,
    analyzer=None,
):
    """Run the full NER pipeline: optional summarisation, then entity extraction.

    Args:
        text: A string, a list of strings, or a DataFrame.
        language: Language code.
        model: Model passed to ``build_presidio_analyzer``.
        engine_type: Engine selector passed to ``build_presidio_analyzer``.
        fast: Use the Luhn summarizer and enable automatic compression.
        compression_ratio: ``"auto"`` (``len(text) / 15000`` when ``fast``,
            never below 1, otherwise 1) or a number; values above 1 shrink a
            string input through ``get_extractive_summary`` before analysis.
        with_comentions: Forwarded to ``compute_ner_presidio``.
        with_context: Forwarded to ``compute_ner_presidio``.
        entities: Entity types to look for.
        score_threshold: Minimum detection score.
        batch_size: Batch size for list inputs.
        n_process: Process count for list inputs.
        analyzer: Optional pre-built analyzer to reuse; when ``None`` one is
            built from ``language``, ``engine_type`` and ``model``.

    Returns:
        The list of ``NERObject`` produced by ``compute_ner_presidio``.
    """
    if analyzer is None:
        # Building an analyzer is expensive; callers processing many texts can
        # build one once and pass it in.
        analyzer = build_presidio_analyzer(
            language=language,
            engine_type=engine_type,
            model=model,
        )

    # Summarisation only makes sense for plain text; lists and DataFrames are
    # analysed as they are.
    if isinstance(text, str):
        if compression_ratio == "auto":
            compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0

        if compression_ratio > 1.0:
            sentences = get_extractive_summary(
                text,
                language,
                int(len(text) / compression_ratio),
                fast=fast,
                with_scores=True,
            )
            text = " ".join(sentence for sentence, _ in sentences)

    return compute_ner_presidio(
        text,
        language,
        analyzer,
        entities,
        score_threshold,
        150,
        with_comentions,
        with_context,
        batch_size,
        n_process,
    )
549
+
550
+
551
def get_ner_handler(
    language,
    model,
    engine_type="spacy",
    fast=False,
    entities=None,
    score_threshold=0.5,
    batch_size=32,
    n_process=4,
):
    """Return a callable that runs ``ner_pipe`` with these settings bound.

    If no tokenizer can be obtained for ``language`` (``LookupError``), the
    handler falls back to ``"en"``. The returned callable accepts
    ``(text, compression_ratio="auto", with_comentions=True, with_context=True)``.
    """
    try:
        get_nltk_tokenizer(language)
    except LookupError:
        language = "en"

    def handler(text, compression_ratio="auto", with_comentions=True, with_context=True):
        return ner_pipe(
            text=text,
            language=language,
            model=model,
            engine_type=engine_type,
            fast=fast,
            compression_ratio=compression_ratio,
            with_comentions=with_comentions,
            with_context=with_context,
            entities=entities,
            score_threshold=score_threshold,
            batch_size=batch_size,
            n_process=n_process,
        )

    return handler
580
+
581
+
582
@st.cache_resource
def get_cached_ner_handler(language, model):
    """Return the default NER handler for ``(language, model)`` via Streamlit's resource cache."""
    handler = get_ner_handler(language, model)
    return handler
@@ -1,329 +0,0 @@
1
- import re
2
- import streamlit as st
3
- from spacy_download import load_spacy
4
- from flair.data import Sentence
5
- from flair.models import SequenceTagger
6
- from sumy.parsers.plaintext import PlaintextParser
7
- from sumy.nlp.tokenizers import Tokenizer
8
- from sumy.nlp.stemmers import Stemmer
9
- from sumy.summarizers.lsa import LsaSummarizer
10
- from sumy.summarizers.luhn import LuhnSummarizer
11
- from sumy.utils import get_stop_words
12
- import itertools
13
- import numpy as np
14
- import math
15
- import nltk
16
- from typing import Optional, List
17
- from pydantic import BaseModel, ConfigDict, Field
18
-
19
- SPACY_NER_MODELS = {
20
- "english": lambda: load_spacy(
21
- "en_core_web_sm",
22
- disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
23
- )
24
- }
25
- FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
26
- REGEX_NER_MODELS = {
27
- "IP_ADDRESS": [
28
- r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
29
- ],
30
- "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
31
- "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
32
- "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
33
- }
34
-
35
- BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
36
- BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
37
-
38
-
39
- def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
40
- from operator import attrgetter
41
- from sumy.summarizers._summarizer import SentenceInfo
42
-
43
- rate = rating
44
- if isinstance(rating, dict):
45
- assert not args and not kwargs
46
- rate = lambda s: rating[s]
47
- infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
48
- infos = sorted(infos, key=attrgetter("rating"), reverse=True)
49
- return tuple((i.sentence, i.rating, i.order) for i in infos)
50
-
51
-
52
- def _sumy__lsa_call(summarizer, document):
53
- summarizer._ensure_dependecies_installed()
54
- dictionary = summarizer._create_dictionary(document)
55
- if not dictionary:
56
- return ()
57
- matrix = summarizer._create_matrix(document, dictionary)
58
- matrix = summarizer._compute_term_frequency(matrix)
59
- from numpy.linalg import svd as singular_value_decomposition
60
-
61
- u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
62
- ranks = iter(summarizer._compute_ranks(sigma, v))
63
- return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
64
-
65
-
66
- def _sumy__luhn_call(summarizer, document):
67
- words = summarizer._get_significant_words(document.words)
68
- return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
69
-
70
-
71
- def get_nltk_tokenizer(language: str) -> Tokenizer:
72
- nltk.data.find("tokenizers/punkt")
73
- return Tokenizer(language)
74
-
75
-
76
- class NERObject(BaseModel):
77
- name: str
78
- label: str
79
- score: float = 0.0
80
- start: int
81
- count: int
82
- context: str | None = None
83
- comentions: list[str] = Field(default_factory=list)
84
- model_config = ConfigDict(extra="allow")
85
-
86
- def __repr__(self):
87
- return f"NERObject(label={self.label},name={self.name})"
88
-
89
-
90
- def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
91
- if whitelisted_labels is not None:
92
- entities = [e for e in entities if e.label in whitelisted_labels]
93
- entities = sorted(entities, key=lambda x: x.name)
94
- final_entities = []
95
- for _, group in itertools.groupby(entities, key=lambda x: x.name):
96
- group = list(group)
97
- best_entity = max(group, key=lambda x: x.score * x.count)
98
- merged_data = {
99
- "name": best_entity.name,
100
- "label": best_entity.label,
101
- "score": best_entity.score,
102
- "context": best_entity.context,
103
- "count": sum(e.count for e in group),
104
- "start": best_entity.start,
105
- }
106
- all_fields = best_entity.model_fields.keys()
107
- for field in all_fields:
108
- if field in merged_data:
109
- continue
110
- values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
111
- if not values:
112
- continue
113
- if isinstance(values[0], list):
114
- merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
115
- else:
116
- merged_data[field] = getattr(best_entity, field, None)
117
- final_entities.append(NERObject(**merged_data))
118
- final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
119
- if max_entities and len(final_entities) > max_entities:
120
- final_entities = final_entities[:max_entities]
121
- return final_entities
122
-
123
-
124
- def compute_ner(
125
- language,
126
- sentences,
127
- spacy_model,
128
- flair_model=None,
129
- context_width=150,
130
- with_scores=True,
131
- with_comentions=True,
132
- with_context=True,
133
- ):
134
- sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
135
- del sentence_starts[-1]
136
- sentence_starts = list(np.cumsum(sentence_starts))
137
- text = "\n".join([s[0] for s in sentences])
138
- min_score = 1.0
139
- entities: list[NERObject] = []
140
-
141
- # FLAIR model (if not fast)
142
- if flair_model:
143
- input = [Sentence(sentence[0]) for sentence in sentences]
144
- flair_model.predict(input)
145
- output = [e for sentence in input for e in sentence.get_spans("ner")]
146
- flair_entities = [
147
- NERObject(
148
- name=entity.text,
149
- label=BASE_TO_ONTONOTES_LABELMAP.get(
150
- entity.annotation_layers["ner"][0].value,
151
- entity.annotation_layers["ner"][0].value,
152
- ),
153
- score=entity.score,
154
- start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
155
- count=1,
156
- )
157
- for entity in output
158
- ]
159
- min_score = min([min_score] + [e.score for e in flair_entities])
160
- entities += flair_entities
161
- del flair_entities
162
-
163
- # REGEX model
164
- for label, regexes in REGEX_NER_MODELS.items():
165
- if not isinstance(regexes, list):
166
- regexes = [regexes]
167
- for regex in regexes:
168
- regex_entities = [
169
- NERObject(
170
- name=match.group(),
171
- label=label,
172
- score=min_score - 0.5,
173
- count=1,
174
- start=match.start(),
175
- )
176
- for match in re.finditer(regex, text)
177
- ]
178
- entities += regex_entities
179
- min_score = min([min_score] + [e.score for e in regex_entities])
180
-
181
- # SPACY model
182
- chunks = []
183
- chunk_start_offsets = []
184
- current_chunk = []
185
- current_length = 0
186
- offset = 0
187
- for sentence, _ in sentences:
188
- sentence_len = len(sentence) + 1
189
- if sentence_len > spacy_model.max_length:
190
- truncated = sentence[: spacy_model.max_length - 1]
191
- chunks.append(truncated)
192
- chunk_start_offsets.append(offset)
193
- offset += sentence_len
194
- continue
195
- if current_length + sentence_len > spacy_model.max_length:
196
- chunks.append("\n".join(current_chunk))
197
- chunk_start_offsets.append(offset - current_length)
198
- current_chunk = []
199
- current_length = 0
200
- current_chunk.append(sentence)
201
- current_length += sentence_len
202
- offset += sentence_len
203
- if current_chunk:
204
- chunks.append("\n".join(current_chunk))
205
- chunk_start_offsets.append(offset - current_length)
206
- for i, chunk in enumerate(chunks):
207
- doc = spacy_model(chunk)
208
- chunk_offset = chunk_start_offsets[i]
209
- for entity in doc.ents:
210
- entities.append(
211
- NERObject(
212
- name=entity.text,
213
- label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
214
- score=min_score - 0.5,
215
- start=chunk_offset + entity.start_char,
216
- count=1,
217
- )
218
- )
219
-
220
- # Reformatting for consistency
221
- if not entities:
222
- return []
223
- if with_scores:
224
- min_entity_score = min([e.score for e in entities])
225
- max_entity_score = max([e.score for e in entities])
226
- entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
227
- for e in entities:
228
- e.score = (e.score - min_entity_score) / entity_score_range
229
- scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
230
- scores = [sentences[i - 1][1] for i in scores]
231
- scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
232
- for i in range(len(entities)):
233
- entities[i].score = scores[i]
234
- else:
235
- for i in range(len(entities)):
236
- entities[i].score = 0.0
237
- if with_comentions:
238
- for i in range(len(entities)):
239
- entity = entities[i]
240
- comentions = [
241
- entities[j].name
242
- for j in range(len(entities))
243
- if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
244
- ]
245
- entities[i].comentions = comentions
246
- if with_context:
247
- for i in range(len(entities)):
248
- entity = entities[i]
249
- if entity.start >= 0 and entity.start < len(text):
250
- left = max(0, entity.start - math.floor(context_width / 2))
251
- right = min(len(text), entity.start + math.ceil(context_width / 2))
252
- context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
253
- entities[i].context = context
254
- return entities
255
-
256
-
257
def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
    """Return an extractive summary of ``text`` that fits within ``max_chars``.

    Sentences are pulled from the summarizer's scored output until the
    character budget is used up. The sentence that crosses the budget is
    truncated so that the space-joined summary does not exceed ``max_chars``.
    The kept sentences are then re-ordered by the third element of each
    scored item (presumably the sentence's position in the document --
    confirm against the ``_sumy__*_call`` helpers).

    Args:
        text: Plain text to summarize.
        language: Language name passed to the tokenizer, stemmer and
            stop-word lookup.
        max_chars: Character budget for the summary. Each sentence is
            charged its length plus one separator character.
        fast: Use the Luhn summarizer when true, the LSA summarizer otherwise.
        with_scores: When false, return a single space-joined string. When
            true, return a list of ``(sentence_text, score)`` tuples whose
            scores are min-max normalized to ``[0, 1]``.

    Returns:
        ``str`` if ``with_scores`` is false, else ``list[tuple[str, float]]``.
    """
    tokenizer = get_nltk_tokenizer(language)
    stemmer = Stemmer(language)
    parser = PlaintextParser.from_string(text, tokenizer)
    if fast:
        summarizer = LuhnSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
    else:
        summarizer = LsaSummarizer(stemmer)
        summarizer.stop_words = get_stop_words(language)
        scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))

    selected = []  # (sentence_text, score, sort_key)
    used_chars = 0
    while used_chars < max_chars:
        candidate = next(scored_sentences, None)
        if candidate is None:
            break
        sentence_text = candidate[0]._text
        cost = len(sentence_text) + 1  # +1 for the joining space
        if used_chars + cost > max_chars:
            # Truncate the sentence that actually overflows the budget, here
            # and now. Doing this after the re-ordering below would cut
            # whichever sentence happens to sort last instead.
            sentence_text = sentence_text[: max_chars - used_chars]
        selected.append((sentence_text, candidate[1], candidate[2]))
        used_chars += cost

    selected.sort(key=lambda item: item[2])
    summary = [(sentence_text, score) for sentence_text, score, _ in selected]

    if not with_scores:
        return " ".join(sentence_text for sentence_text, _ in summary)

    # Min-max normalize the scores; a constant score set maps to 0.0.
    min_score = min([s[1] for s in summary]) if summary else 0
    max_score = max([min_score] + [s[1] for s in summary])
    score_range = 1 if min_score == max_score else (max_score - min_score)
    return [(s[0], (s[1] - min_score) / score_range) for s in summary]
295
-
296
-
297
def ner_pipe(
    text,
    language,
    spacy_model,
    flair_model=None,
    fast=False,
    compression_ratio="auto",
    with_scores=True,
    with_comentions=True,
    with_context=True,
):
    """Summarize ``text`` extractively, then run NER over the kept sentences.

    Args:
        text: Input text.
        language: Language name forwarded to the summarizer and to
            ``compute_ner``.
        spacy_model: Loaded spaCy pipeline forwarded to ``compute_ner``.
        flair_model: Optional Flair tagger forwarded to ``compute_ner``.
        fast: Selects the fast summarizer and, with ``"auto"`` compression,
            caps the summary at roughly 15000 characters.
        compression_ratio: ``"auto"`` or a number; the summary budget is
            ``len(text) / compression_ratio`` characters.
        with_scores, with_comentions, with_context: Forwarded to
            ``compute_ner``.

    Returns:
        Whatever ``compute_ner`` returns for the summarized sentences.
    """
    if compression_ratio == "auto":
        if fast:
            # Shrink long texts so the summary stays near 15000 characters.
            compression_ratio = max(1.0, len(text) / 15000)
        else:
            compression_ratio = 1.0

    summary_budget = int(len(text) / compression_ratio)
    scored_sentences = get_extractive_summary(
        text, language, summary_budget, fast=fast, with_scores=True
    )
    # The literal 150 is the fifth positional argument of compute_ner
    # (presumably its context width -- confirm against its signature).
    return compute_ner(
        language,
        scored_sentences,
        spacy_model,
        flair_model,
        150,
        with_scores,
        with_comentions,
        with_context,
    )
313
-
314
-
315
def get_ner_handler(language, fast=False):
    """Load the NER models for ``language`` and return a callable bound to them.

    If ``get_nltk_tokenizer`` raises ``LookupError`` for ``language``, the
    language falls back to ``"english"``. Languages missing from the model
    registries also fall back to the English entry. The Flair model is only
    loaded when ``fast`` is false.

    Returns:
        A function ``handler(text, compression_ratio="auto", with_scores=True,
        with_comentions=True, with_context=True)`` that calls ``ner_pipe``
        with the loaded models.
    """
    try:
        get_nltk_tokenizer(language)  # raises a LookupError if the language is not valid
    except LookupError:
        language = "english"

    load_spacy = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])
    spacy_model = load_spacy()

    if fast:
        flair_model = None
    else:
        load_flair = FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])
        flair_model = load_flair()

    def handler(text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True):
        return ner_pipe(
            text,
            language,
            spacy_model,
            flair_model,
            fast,
            compression_ratio,
            with_scores,
            with_comentions,
            with_context,
        )

    return handler
325
-
326
-
327
@st.cache_resource
def get_cached_ner_handler(language, fast):
    """Return the handler from ``get_ner_handler``, wrapped in ``st.cache_resource``.

    Both arguments are forwarded unchanged to ``get_ner_handler``.
    """
    handler = get_ner_handler(language, fast)
    return handler