streamlit-octostar-utils 0.2.10.tar.gz → 2.11a2.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/fastapi.py +1 -1
  4. streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/language.py +55 -0
  5. streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/ner.py +634 -0
  6. streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/language.py +0 -15
  7. streamlit_octostar_utils-0.2.10/streamlit_octostar_utils/nlp/ner.py +0 -329
  8. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/LICENSE +0 -0
  9. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/README.md +0 -0
  10. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/__init__.py +0 -0
  11. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  12. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  13. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  14. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  15. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  16. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  17. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  18. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  19. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  20. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  21. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  22. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  23. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  24. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/__init__.py +0 -0
  25. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/dict.py +0 -0
  26. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/filetypes.py +0 -0
  27. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  28. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  29. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/core/timestamp.py +0 -0
  30. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  31. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  32. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/client.py +0 -0
  33. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/context.py +0 -0
  34. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  35. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  36. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  37. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  38. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/ontology/validation.py +0 -0
  39. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/style/__init__.py +0 -0
  40. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/style/common.py +0 -0
  41. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/__init__.py +0 -0
  42. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  43. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  44. {streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0

{streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version: 0.2.10
+Version: 2.11a2
 Summary:
 License: MIT
 License-File: LICENSE

{streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/pyproject.toml
@@ -5,7 +5,7 @@ include = '\.pyi?$'
 
 [tool.poetry]
 name = "streamlit-octostar-utils"
-version = "0.2.10"
+version = "2.11a2"
 description = ""
 license = "MIT"
 authors = ["Octostar"]

{streamlit_octostar_utils-0.2.10 → streamlit_octostar_utils-2.11a2}/streamlit_octostar_utils/api_crafter/fastapi.py
@@ -330,7 +330,7 @@ class DefaultErrorRoute:
         if len(message) > MAX_ERROR_MESSAGE_BYTES:
             message = message[-MAX_ERROR_MESSAGE_BYTES:]
         try:
-            tcbk = "\n".join(traceback.format_exception(exc))
+            tcbk = traceback.format_exception(exc)
             if len(tcbk) > MAX_ERROR_TRACEBACK_BYTES:
                 tcbk = tcbk[-MAX_ERROR_TRACEBACK_BYTES:]
         except:
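
Note on the fastapi.py hunk above: traceback.format_exception(exc) returns a list of strings rather than a single string, so after this change the len() check and the slice operate on list items instead of characters. A minimal standalone sketch of the difference (not part of the package; assumes the Python 3.10+ single-argument signature):

import traceback

try:
    1 / 0
except ZeroDivisionError as exc:
    as_list = traceback.format_exception(exc)  # 2.11a2 behaviour: list of strings
    as_text = "\n".join(as_list)               # 0.2.10 behaviour: one joined string
    print(type(as_list), len(as_list))         # <class 'list'>, number of segments
    print(type(as_text), len(as_text))         # <class 'str'>, number of characters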

streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/language.py (new file)
@@ -0,0 +1,55 @@
+import re
+import py3langid as langid
+import iso639 as languages
+
+
+def alpha2_to_language(alpha2: str) -> str:
+    if not alpha2:
+        return None
+    code = alpha2.strip().lower()
+    return languages.to_name(code)
+
+def language_to_alpha2(language_name: str) -> str:
+    if not language_name:
+        return None
+    name = language_name.strip().lower()
+    data = languages.find(name)
+    return data["iso639_1"]
+
+def detect_language(text, min_confidence=None):
+    detector = langid.langid.LanguageIdentifier.from_pickled_model(
+        langid.langid.MODEL_FILE, norm_probs=True
+    )
+    detected_lang, confidence = detector.classify(text)
+    if min_confidence and confidence < min_confidence:
+        return None, confidence
+    detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
+    detected_lang = languages.to_name(detected_lang).lower()
+    return detected_lang, confidence
+
+FLAIR_MODELS = {
+    "en": "flair/ner-english-large",
+    "es": "flair/ner-spanish-large",
+    "de": "flair/ner-german-large",
+    "nl": "flair/ner-dutch-large",
+    "multi": "flair/ner-multi",
+    "multi-fast": "flair/ner-multi-fast",
+}
+
+SPACY_MODELS = {
+    "en": 'en_core_web_sm',
+}
+
+def load_language_model(language, type):
+    from flair.models import SequenceTagger
+    from spacy_download import load_spacy
+
+    model = None
+    match type:
+        case "spacy":
+            model_name = SPACY_MODELS.get(language, SPACY_MODELS["en"])
+            model = load_spacy(model_name)
+        case "flair":
+            model_name = FLAIR_MODELS.get(language, "flair/ner-multi")
+            model = SequenceTagger.load(model_name)
+    return model
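
A short usage sketch for the new nlp/language.py helpers (not part of the package; assumes py3langid, iso639, spacy_download and flair are installed, and the printed values are only illustrative):

from streamlit_octostar_utils.nlp.language import (
    alpha2_to_language,
    detect_language,
    language_to_alpha2,
    load_language_model,
)

# detect_language returns a lowercased language name plus a normalized confidence,
# or (None, confidence) when min_confidence is set and not reached.
lang, confidence = detect_language("Guten Morgen, wie geht es Ihnen?", min_confidence=0.5)
print(lang, confidence)              # e.g. "german", 0.99

print(alpha2_to_language("de"))      # e.g. "German"
print(language_to_alpha2("German"))  # e.g. "de"

# load_language_model dispatches on its `type` argument: "spacy" loads a spaCy
# pipeline via spacy_download, "flair" loads a SequenceTagger (default flair/ner-multi).
tagger = load_language_model("de", "flair")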

streamlit_octostar_utils-2.11a2/streamlit_octostar_utils/nlp/ner.py (new file)
@@ -0,0 +1,634 @@
+import itertools
+import math
+from typing import Optional, List, Tuple
+from pydantic import BaseModel, ConfigDict, Field
+from collections import Counter
+
+from presidio_analyzer import AnalyzerEngine, BatchAnalyzerEngine, RecognizerRegistry, AnalysisExplanation, \
+    EntityRecognizer, RecognizerResult
+from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
+import streamlit as st
+import nltk
+import pandas as pd
+from flair.data import Sentence
+from flair.models import SequenceTagger
+
+from sumy.parsers.plaintext import PlaintextParser
+from sumy.nlp.tokenizers import Tokenizer
+from sumy.nlp.stemmers import Stemmer
+from sumy.summarizers.lsa import LsaSummarizer
+from sumy.summarizers.luhn import LuhnSummarizer
+from sumy.utils import get_stop_words
+
+from nlp.language import alpha2_to_language
+
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL",
+                       "CRYPTO", "IBAN", "CREDIT_CARD", "US_SSN", "US_DRIVER_LICENSE", "US_PASSPORT", "MEDICAL_LICENSE"]
+
+PRESIDIO_TO_BASE_ALIASES = {
+    "PHONE_NUMBER": "PHONE",
+    "EMAIL_ADDRESS": "EMAIL",
+    "IBAN_CODE": "IBAN",
+    "DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVER_LICENSE": "US_DRIVER_LICENSE",
+    "US_DRIVERS_LICENSE": "US_DRIVER_LICENSE",
+    "PASSPORT": "US_PASSPORT",
+    "CREDIT_CARD": "CREDIT_CARD",
+    "URL": "URL",
+    "IP_ADDRESS": "IP_ADDRESS",
+    "CRYPTO": "CRYPTO",
+    "CRYPTO_WALLET": "CRYPTO",
+    "CRYPTO_WALLET_ADDRESS": "CRYPTO",
+    "DATE_TIME": "DATE",
+    "LOCATION": "LOC",
+    "ORGANIZATION": "ORG",
+}
+
+BASE_TO_RECOGNIZER_EXPANSIONS = {
+    "ORG": ["ORG", "ORGANIZATION"],
+    "LOC": ["LOC", "LOCATION"],
+    "PHONE": ["PHONE", "PHONE_NUMBER"],
+    "EMAIL": ["EMAIL", "EMAIL_ADDRESS"],
+    "IBAN": ["IBAN", "IBAN_CODE"],
+    "US_DRIVER_LICENSE": ["US_DRIVER_LICENSE", "US_DRIVERS_LICENSE", "DRIVER_LICENSE"],
+    "US_PASSPORT": ["US_PASSPORT", "PASSPORT"],
+    "DATE": ["DATE", "DATE_TIME"],
+    "PERSON": ["PERSON"],
+    "URL": ["URL"],
+    "IP_ADDRESS": ["IP_ADDRESS"],
+    "CRYPTO": ["CRYPTO", "CRYPTO_WALLET", "CRYPTO_WALLET_ADDRESS"],
+    "CREDIT_CARD": ["CREDIT_CARD"],
+    "US_SSN": ["US_SSN"],
+    "MEDICAL_LICENSE": ["MEDICAL_LICENSE"],
+    "NORP": ["NORP"],
+    "GPE": ["GPE"],
+    "PRODUCT": ["PRODUCT"],
+}
+
+BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
+
+
+class FlairRecognizer(EntityRecognizer):
+    ENTITIES = [
+        "LOC",
+        "PERSON",
+        "ORG",
+    ]
+
+    DEFAULT_EXPLANATION = "Identified as {} by Flair's Named Entity Recognition"
+
+    CHECK_LABEL_GROUPS = [
+        ({"LOC"}, {"LOC", "LOCATION"}),
+        ({"PERSON"}, {"PER", "PERSON"}),
+        ({"ORG"}, {"ORG", "ORGANIZATION"}),
+    ]
+
+    MODEL_LANGUAGES = {
+        "en": "flair/ner-english-large",
+        "es": "flair/ner-spanish-large",
+        "de": "flair/ner-german-large",
+        "nl": "flair/ner-dutch-large",
+        "multi": "flair/ner-multi",
+        "multi-fast": "flair/ner-multi-fast",
+    }
+
+    PRESIDIO_EQUIVALENCES = {
+        "PER": "PERSON",
+        "LOC": "LOC",
+        "ORG": "ORG"
+    }
+
+    def __init__(
+        self,
+        model: SequenceTagger = None,
+        supported_language: str = "en",
+        supported_entities: Optional[List[str]] = None,
+        check_label_groups: Optional[Tuple[set, set]] = None,
+    ):
+        self.check_label_groups = (
+            check_label_groups if check_label_groups else self.CHECK_LABEL_GROUPS
+        )
+
+        supported_entities = supported_entities if supported_entities else self.ENTITIES
+        self.model = model
+
+        super().__init__(
+            supported_entities=supported_entities,
+            supported_language=supported_language,
+            name="Flair Analytics",
+        )
+
+    def load(self) -> None:
+        pass
+
+    def get_supported_entities(self) -> List[str]:
+        return self.supported_entities
+
+    def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts = None) -> List[RecognizerResult]:
+        results = []
+
+        sentences = Sentence(text)
+        self.model.predict(sentences)
+
+        if not entities:
+            entities = self.supported_entities
+
+        for entity in entities:
+            if entity not in self.supported_entities:
+                continue
+
+            for ent in sentences.get_spans("ner"):
+                if not self.__check_label(
+                    entity, ent.labels[0].value, self.check_label_groups
+                ):
+                    continue
+                textual_explanation = self.DEFAULT_EXPLANATION.format(
+                    ent.labels[0].value
+                )
+                explanation = self.build_flair_explanation(
+                    round(ent.score, 2), textual_explanation
+                )
+                flair_result = self._convert_to_recognizer_result(ent, explanation)
+
+                results.append(flair_result)
+
+        return results
+
+    def build_flair_explanation(self, original_score: float, explanation: str) -> AnalysisExplanation:
+        explanation = AnalysisExplanation(
+            recognizer=self.__class__.__name__,
+            original_score=original_score,
+            textual_explanation=explanation,
+        )
+        return explanation
+
+    def _convert_to_recognizer_result(self, entity, explanation) -> RecognizerResult:
+        entity_type = self.PRESIDIO_EQUIVALENCES.get(entity.tag, entity.tag)
+        flair_score = round(entity.score, 2)
+
+        flair_results = RecognizerResult(
+            entity_type=entity_type,
+            start=entity.start_position,
+            end=entity.end_position,
+            score=flair_score,
+            analysis_explanation=explanation,
+        )
+
+        return flair_results
+
+    @staticmethod
+    def __check_label(
+        entity: str, label: str, check_label_groups: Tuple[set, set]
+    ) -> bool:
+        return any(
+            [entity in egrp and label in lgrp for egrp, lgrp in check_label_groups]
+        )
+
+
+def normalize_label(label: str) -> str:
+    return PRESIDIO_TO_BASE_ALIASES.get(label, label)
+
+
+def expand_entities_for_analyzer(entities_list):
+    expanded = set()
+    for e in entities_list:
+        vals = BASE_TO_RECOGNIZER_EXPANSIONS.get(e, [e])
+        for v in vals:
+            expanded.add(v)
+    return list(expanded)
+
+
+def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
+    from operator import attrgetter
+    from sumy.summarizers._summarizer import SentenceInfo
+
+    rate = rating
+    if isinstance(rating, dict):
+        assert not args and not kwargs
+        rate = lambda s: rating[s]
+    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
+    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
+    return tuple((i.sentence, i.rating, i.order) for i in infos)
+
+
+def _sumy__lsa_call(summarizer, document):
+    summarizer._ensure_dependecies_installed()
+    dictionary = summarizer._create_dictionary(document)
+    if not dictionary:
+        return ()
+    matrix = summarizer._create_matrix(document, dictionary)
+    matrix = summarizer._compute_term_frequency(matrix)
+    from numpy.linalg import svd as singular_value_decomposition
+
+    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
+    ranks = iter(summarizer._compute_ranks(sigma, v))
+    return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
+
+
+def _sumy__luhn_call(summarizer, document):
+    words = summarizer._get_significant_words(document.words)
+    return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
+
+
+def get_nltk_tokenizer(language: str) -> Tokenizer:
+    nltk_lang = alpha2_to_language(language).lower()
+
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+
+    return Tokenizer(nltk_lang)
+
+
+class NERObject(BaseModel):
+    name: str
+    label: str
+    score: float = 0.0
+    start: int
+    count: int
+    context: str | None = None
+    comentions: list[str] = Field(default_factory=list)
+    model_config = ConfigDict(extra="allow")
+
+    def __repr__(self):
+        return f"NERObject(label={self.label},name={self.name})"
+
+
+def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
+    if whitelisted_labels is not None:
+        entities = [e for e in entities if e.label in whitelisted_labels]
+    entities = sorted(entities, key=lambda x: x.name)
+    final_entities = []
+    for _, group in itertools.groupby(entities, key=lambda x: x.name):
+        group = list(group)
+        best_entity = max(group, key=lambda x: x.score * x.count)
+        merged_data = {
+            "name": best_entity.name,
+            "label": best_entity.label,
+            "score": best_entity.score,
+            "context": best_entity.context,
+            "count": sum(e.count for e in group),
+            "start": best_entity.start,
+        }
+        all_fields = best_entity.model_fields.keys()
+        for field in all_fields:
+            if field in merged_data:
+                continue
+            values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
+            if not values:
+                continue
+            if isinstance(values[0], list):
+                merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
+            else:
+                merged_data[field] = getattr(best_entity, field, None)
+        final_entities.append(NERObject(**merged_data))
+    final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
+    if max_entities and len(final_entities) > max_entities:
+        final_entities = final_entities[:max_entities]
+    return final_entities
+
+
+def build_presidio_analyzer(language: str, engine_type: str = "spacy", model=None) -> AnalyzerEngine:
+    registry = RecognizerRegistry()
+
+    if engine_type == "flair":
+
+        flair_recognizer = FlairRecognizer(
+            model=model,
+            supported_language=language
+        )
+        registry.add_recognizer(flair_recognizer)
+
+        default_registry = RecognizerRegistry()
+        default_registry.load_predefined_recognizers()
+
+        flair_handled_entities = {"PERSON", "LOC", "ORG"}
+
+        for recognizer in default_registry.recognizers:
+            recognizer_entities = set(recognizer.supported_entities) if hasattr(recognizer, 'supported_entities') else set()
+
+            if recognizer_entities and recognizer_entities.issubset(flair_handled_entities):
+                continue
+
+            registry.add_recognizer(recognizer)
+
+        return AnalyzerEngine(
+            registry=registry,
+            supported_languages=[language]
+        )
+
+    else:
+        registry.load_predefined_recognizers()
+
+        if model is None:
+            raise ValueError("SpaCy model name must be provided")
+
+        configuration = {
+            "nlp_engine_name": "spacy",
+            "models": [{"lang_code": language, "model_name": model}],
+        }
+
+        provider = NlpEngineProvider(nlp_configuration=configuration)
+        nlp_engine = provider.create_engine()
+
+        return AnalyzerEngine(
+            nlp_engine=nlp_engine,
+            registry=registry,
+            supported_languages=[language],
+        )
+
+
+def analyze_column_sample(column_values: pd.Series, analyzer: AnalyzerEngine, language: str,
+                          entities: Optional[List[str]], score_threshold: float) -> Optional[str]:
+    sample_values = column_values.dropna().head(50)
+
+    if sample_values.empty:
+        return None
+
+    entity_counter = Counter()
+
+    for value in sample_values:
+        text = str(value).strip()
+
+        if not text:
+            continue
+
+        results = analyzer.analyze(
+            text=text,
+            language=language,
+            entities=(expand_entities_for_analyzer(entities) if entities else None)
+        )
+
+        for result in results:
+            if result.score >= score_threshold:
+                entity_counter[normalize_label(result.entity_type)] += 1
+
+    if not entity_counter:
+        return None
+
+    most_common = entity_counter.most_common(1)[0]
+    total_detections = sum(entity_counter.values())
+
+    if most_common[1] > total_detections * 0.5:
+        return most_common[0]
+
+    return most_common[0] if entity_counter else None
+
+
+def analyze_dataframe_optimized(df: pd.DataFrame, analyzer: AnalyzerEngine, language: str,
+                                entities: Optional[List[str]] = None, score_threshold: float = 0.5) -> List[NERObject]:
+    ner_objects = []
+
+    for column_name in df.columns:
+        entity_type = analyze_column_sample(
+            df[column_name],
+            analyzer,
+            language,
+            entities,
+            score_threshold
+        )
+
+        if entity_type:
+            for idx, value in df[column_name].dropna().items():
+                text = str(value).strip()
+
+                if text:
+                    ner_objects.append(NERObject(
+                        name=text[:100],
+                        label=entity_type,
+                        score=0.9,
+                        start=0,
+                        count=1,
+                        context=text[:100]
+                    ))
+
+    return ner_objects
+
+
+def compute_ner_presidio(
+    text,
+    language,
+    analyzer,
+    entities=None,
+    score_threshold=0.5,
+    context_width=150,
+    with_comentions=True,
+    with_context=True,
+    batch_size=32,
+    n_process=4
+):
+    if isinstance(text, pd.DataFrame):
+        if len(text) >= 100:
+            return analyze_dataframe_optimized(text, analyzer, language, entities, score_threshold)
+
+        else:
+            texts = []
+
+            for col in text.columns:
+                for idx, value in text[col].dropna().items():
+                    text_value = str(value).strip()
+
+                    if text_value:
+                        texts.append(text_value)
+
+            text = "\n".join(texts)
+
+    elif isinstance(text, list):
+        batch_analyzer = BatchAnalyzerEngine(analyzer_engine=analyzer)
+
+        results_generator = batch_analyzer.analyze_iterator(
+            texts=text,
+            language=language,
+            batch_size=batch_size,
+            n_process=n_process,
+            entities=(expand_entities_for_analyzer(entities) if entities else None),
+        )
+
+        all_results = list(results_generator)
+        ner_objects = []
+
+        for text_item, results in zip(text, all_results):
+            for result in results:
+                if result.score >= score_threshold:
+                    context_start = max(0, result.start - 30)
+                    context_end = min(len(text_item), result.end + 30)
+                    context = text_item[context_start:context_end] if with_context else None
+
+                    ner_objects.append(NERObject(
+                        name=text_item[result.start:result.end],
+                        label=normalize_label(result.entity_type),
+                        score=float(result.score),
+                        start=int(result.start),
+                        count=1,
+                        context=context
+                    ))
+
+        return ner_objects
+
+    results = analyzer.analyze(
+        text=text,
+        language=language,
+        entities=(expand_entities_for_analyzer(entities) if entities else None)
+    )
+
+    ner_objects = []
+
+    for result in results:
+        if result.score >= score_threshold:
+            context_start = max(0, result.start - math.floor(context_width / 2))
+            context_end = min(len(text), result.end + math.ceil(context_width / 2))
+            context = text[context_start:context_end] if with_context else None
+
+            ner_objects.append(NERObject(
+                name=text[result.start:result.end],
+                label=normalize_label(result.entity_type),
+                score=float(result.score),
+                start=int(result.start),
+                count=1,
+                context=context
+            ))
+
+    if with_comentions:
+        for i in range(len(ner_objects)):
+            entity = ner_objects[i]
+            comentions = [
+                ner_objects[j].name
+                for j in range(len(ner_objects))
+                if j != i and abs(ner_objects[j].start - entity.start) < math.ceil(context_width / 2)
+            ]
+            ner_objects[i].comentions = comentions
+
+    return ner_objects
+
+
+def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
+    tokenizer = get_nltk_tokenizer(language)
+    stemmer = Stemmer(language)
+    parser = PlaintextParser.from_string(text, tokenizer)
+    if fast:
+        summarizer = LuhnSummarizer(stemmer)
+        summarizer.stop_words = get_stop_words(language)
+        scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
+    else:
+        summarizer = LsaSummarizer(stemmer)
+        summarizer.stop_words = get_stop_words(language)
+        scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
+    summary = []
+    summary_chars = 0
+    summary_chars_penultimate = 0
+    while summary_chars < max_chars:
+        try:
+            next_sentence = next(scored_sentences)
+            summary.append(next_sentence)
+            summary_chars_penultimate = summary_chars
+            summary_chars += len(" " + next_sentence[0]._text)
+        except StopIteration:
+            break
+    summary = sorted(summary, key=lambda x: x[2])
+    summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
+    if summary_chars > max_chars:
+        summary[-1] = (
+            summary[-1][0][: max_chars - summary_chars_penultimate],
+            summary[-1][1],
+        )
+    if not with_scores:
+        summary = " ".join([s[0] for s in summary])
+    else:
+        min_score = min([s[1] for s in summary]) if summary else 0
+        max_score = max([min_score] + [s[1] for s in summary])
+        score_range = 1 if min_score == max_score else (max_score - min_score)
+        summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
+    return summary
+
+
+def ner_pipe(
+    text,
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    compression_ratio="auto",
+    with_comentions=True,
+    with_context=True,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
+    analyzer = build_presidio_analyzer(
+        language=language,
+        engine_type=engine_type,
+        model=model,
+    )
+
+    if isinstance(text, pd.DataFrame):
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+    else:
+        if compression_ratio == "auto":
+            compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
+
+        if compression_ratio > 1.0:
+            sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast,
+                                               with_scores=True)
+            text = " ".join([s[0] for s in sentences])
+
+        ner = compute_ner_presidio(
+            text,
+            language,
+            analyzer,
+            entities,
+            score_threshold,
+            with_comentions=with_comentions,
+            with_context=with_context,
+            batch_size=batch_size,
+            n_process=n_process
+        )
+
+    return ner
+
+
+def get_ner_handler(
+    language,
+    model,
+    engine_type="spacy",
+    fast=False,
+    entities=None,
+    score_threshold=0.5,
+    batch_size=32,
+    n_process=4
+):
+    try:
+        get_nltk_tokenizer(language)
+    except LookupError:
+        language = "en"
+
+    return lambda text, compression_ratio="auto", with_comentions=True, with_context=True: ner_pipe(
+        text,
+        language,
+        model,
+        engine_type,
+        fast,
+        compression_ratio,
+        with_comentions,
+        with_context,
+        entities,
+        score_threshold,
+        batch_size,
+        n_process
+    )
+
+
+@st.cache_resource
+def get_cached_ner_handler(language, model):
+    return get_ner_handler(language, model)
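
A hypothetical end-to-end sketch of the new Presidio-based pipeline (not part of the package; assumes presidio-analyzer, spaCy and the en_core_web_sm model are available, and the example text and entity filter are arbitrary):

from streamlit_octostar_utils.nlp.ner import get_ner_handler, postprocess_ner

# With engine_type="spacy" the `model` argument is a spaCy model name, which
# build_presidio_analyzer hands to Presidio's NlpEngineProvider; with
# engine_type="flair" it must be an already-loaded flair SequenceTagger
# (see load_language_model in nlp/language.py).
handler = get_ner_handler(
    "en",
    "en_core_web_sm",
    engine_type="spacy",
    entities=["PERSON", "EMAIL"],
    score_threshold=0.5,
)

# The handler accepts a plain string (optionally summarized first), a list of
# texts (batched through BatchAnalyzerEngine), or a pandas DataFrame.
results = handler("Contact Jane Doe at jane.doe@example.com about the invoice.")
for ent in postprocess_ner(results, max_entities=10):
    print(ent.label, ent.name, round(ent.score, 2), ent.comentions)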
@@ -1,15 +0,0 @@
1
- import re
2
- import py3langid as langid
3
- import iso639 as languages
4
-
5
-
6
- def detect_language(text, min_confidence=None):
7
- detector = langid.langid.LanguageIdentifier.from_pickled_model(
8
- langid.langid.MODEL_FILE, norm_probs=True
9
- )
10
- detected_lang, confidence = detector.classify(text)
11
- if min_confidence and confidence < min_confidence:
12
- return None, confidence
13
- detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
14
- detected_lang = languages.to_name(detected_lang).lower()
15
- return detected_lang, confidence
@@ -1,329 +0,0 @@
1
- import re
2
- import streamlit as st
3
- from spacy_download import load_spacy
4
- from flair.data import Sentence
5
- from flair.models import SequenceTagger
6
- from sumy.parsers.plaintext import PlaintextParser
7
- from sumy.nlp.tokenizers import Tokenizer
8
- from sumy.nlp.stemmers import Stemmer
9
- from sumy.summarizers.lsa import LsaSummarizer
10
- from sumy.summarizers.luhn import LuhnSummarizer
11
- from sumy.utils import get_stop_words
12
- import itertools
13
- import numpy as np
14
- import math
15
- import nltk
16
- from typing import Optional, List
17
- from pydantic import BaseModel, ConfigDict, Field
18
-
19
- SPACY_NER_MODELS = {
20
- "english": lambda: load_spacy(
21
- "en_core_web_sm",
22
- disable=["tok2vec", "tagger", "parser", "attribute_ruler", "lemmatizer"],
23
- )
24
- }
25
- FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
26
- REGEX_NER_MODELS = {
27
- "IP_ADDRESS": [
28
- r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
29
- ],
30
- "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
31
- "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
32
- "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
33
- }
34
-
35
- BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
36
- BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]
37
-
38
-
39
- def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
40
- from operator import attrgetter
41
- from sumy.summarizers._summarizer import SentenceInfo
42
-
43
- rate = rating
44
- if isinstance(rating, dict):
45
- assert not args and not kwargs
46
- rate = lambda s: rating[s]
47
- infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
48
- infos = sorted(infos, key=attrgetter("rating"), reverse=True)
49
- return tuple((i.sentence, i.rating, i.order) for i in infos)
50
-
51
-
52
- def _sumy__lsa_call(summarizer, document):
53
- summarizer._ensure_dependecies_installed()
54
- dictionary = summarizer._create_dictionary(document)
55
- if not dictionary:
56
- return ()
57
- matrix = summarizer._create_matrix(document, dictionary)
58
- matrix = summarizer._compute_term_frequency(matrix)
59
- from numpy.linalg import svd as singular_value_decomposition
60
-
61
- u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
62
- ranks = iter(summarizer._compute_ranks(sigma, v))
63
- return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
64
-
65
-
66
- def _sumy__luhn_call(summarizer, document):
67
- words = summarizer._get_significant_words(document.words)
68
- return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
69
-
70
-
71
- def get_nltk_tokenizer(language: str) -> Tokenizer:
72
- nltk.data.find("tokenizers/punkt")
73
- return Tokenizer(language)
74
-
75
-
76
- class NERObject(BaseModel):
77
- name: str
78
- label: str
79
- score: float = 0.0
80
- start: int
81
- count: int
82
- context: str | None = None
83
- comentions: list[str] = Field(default_factory=list)
84
- model_config = ConfigDict(extra="allow")
85
-
86
- def __repr__(self):
87
- return f"NERObject(label={self.label},name={self.name})"
88
-
89
-
90
- def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
91
- if whitelisted_labels is not None:
92
- entities = [e for e in entities if e.label in whitelisted_labels]
93
- entities = sorted(entities, key=lambda x: x.name)
94
- final_entities = []
95
- for _, group in itertools.groupby(entities, key=lambda x: x.name):
96
- group = list(group)
97
- best_entity = max(group, key=lambda x: x.score * x.count)
98
- merged_data = {
99
- "name": best_entity.name,
100
- "label": best_entity.label,
101
- "score": best_entity.score,
102
- "context": best_entity.context,
103
- "count": sum(e.count for e in group),
104
- "start": best_entity.start,
105
- }
106
- all_fields = best_entity.model_fields.keys()
107
- for field in all_fields:
108
- if field in merged_data:
109
- continue
110
- values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
111
- if not values:
112
- continue
113
- if isinstance(values[0], list):
114
- merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
115
- else:
116
- merged_data[field] = getattr(best_entity, field, None)
117
- final_entities.append(NERObject(**merged_data))
118
- final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
119
- if max_entities and len(final_entities) > max_entities:
120
- final_entities = final_entities[:max_entities]
121
- return final_entities
122
-
123
-
124
- def compute_ner(
125
- language,
126
- sentences,
127
- spacy_model,
128
- flair_model=None,
129
- context_width=150,
130
- with_scores=True,
131
- with_comentions=True,
132
- with_context=True,
133
- ):
134
- sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
135
- del sentence_starts[-1]
136
- sentence_starts = list(np.cumsum(sentence_starts))
137
- text = "\n".join([s[0] for s in sentences])
138
- min_score = 1.0
139
- entities: list[NERObject] = []
140
-
141
- # FLAIR model (if not fast)
142
- if flair_model:
143
- input = [Sentence(sentence[0]) for sentence in sentences]
144
- flair_model.predict(input)
145
- output = [e for sentence in input for e in sentence.get_spans("ner")]
146
- flair_entities = [
147
- NERObject(
148
- name=entity.text,
149
- label=BASE_TO_ONTONOTES_LABELMAP.get(
150
- entity.annotation_layers["ner"][0].value,
151
- entity.annotation_layers["ner"][0].value,
152
- ),
153
- score=entity.score,
154
- start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
155
- count=1,
156
- )
157
- for entity in output
158
- ]
159
- min_score = min([min_score] + [e.score for e in flair_entities])
160
- entities += flair_entities
161
- del flair_entities
162
-
163
- # REGEX model
164
- for label, regexes in REGEX_NER_MODELS.items():
165
- if not isinstance(regexes, list):
166
- regexes = [regexes]
167
- for regex in regexes:
168
- regex_entities = [
169
- NERObject(
170
- name=match.group(),
171
- label=label,
172
- score=min_score - 0.5,
173
- count=1,
174
- start=match.start(),
175
- )
176
- for match in re.finditer(regex, text)
177
- ]
178
- entities += regex_entities
179
- min_score = min([min_score] + [e.score for e in regex_entities])
180
-
181
- # SPACY model
182
- chunks = []
183
- chunk_start_offsets = []
184
- current_chunk = []
185
- current_length = 0
186
- offset = 0
187
- for sentence, _ in sentences:
188
- sentence_len = len(sentence) + 1
189
- if sentence_len > spacy_model.max_length:
190
- truncated = sentence[: spacy_model.max_length - 1]
191
- chunks.append(truncated)
192
- chunk_start_offsets.append(offset)
193
- offset += sentence_len
194
- continue
195
- if current_length + sentence_len > spacy_model.max_length:
196
- chunks.append("\n".join(current_chunk))
197
- chunk_start_offsets.append(offset - current_length)
198
- current_chunk = []
199
- current_length = 0
200
- current_chunk.append(sentence)
201
- current_length += sentence_len
202
- offset += sentence_len
203
- if current_chunk:
204
- chunks.append("\n".join(current_chunk))
205
- chunk_start_offsets.append(offset - current_length)
206
- for i, chunk in enumerate(chunks):
207
- doc = spacy_model(chunk)
208
- chunk_offset = chunk_start_offsets[i]
209
- for entity in doc.ents:
210
- entities.append(
211
- NERObject(
212
- name=entity.text,
213
- label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
214
- score=min_score - 0.5,
215
- start=chunk_offset + entity.start_char,
216
- count=1,
217
- )
218
- )
219
-
220
- # Reformatting for consistency
221
- if not entities:
222
- return []
223
- if with_scores:
224
- min_entity_score = min([e.score for e in entities])
225
- max_entity_score = max([e.score for e in entities])
226
- entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
227
- for e in entities:
228
- e.score = (e.score - min_entity_score) / entity_score_range
229
- scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
230
- scores = [sentences[i - 1][1] for i in scores]
231
- scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
232
- for i in range(len(entities)):
233
- entities[i].score = scores[i]
234
- else:
235
- for i in range(len(entities)):
236
- entities[i].score = 0.0
237
- if with_comentions:
238
- for i in range(len(entities)):
239
- entity = entities[i]
240
- comentions = [
241
- entities[j].name
242
- for j in range(len(entities))
243
- if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
244
- ]
245
- entities[i].comentions = comentions
246
- if with_context:
247
- for i in range(len(entities)):
248
- entity = entities[i]
249
- if entity.start >= 0 and entity.start < len(text):
250
- left = max(0, entity.start - math.floor(context_width / 2))
251
- right = min(len(text), entity.start + math.ceil(context_width / 2))
252
- context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
253
- entities[i].context = context
254
- return entities
255
-
256
-
257
- def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
258
- tokenizer = get_nltk_tokenizer(language)
259
- stemmer = Stemmer(language)
260
- parser = PlaintextParser.from_string(text, tokenizer)
261
- if fast:
262
- summarizer = LuhnSummarizer(stemmer)
263
- summarizer.stop_words = get_stop_words(language)
264
- scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
265
- else:
266
- summarizer = LsaSummarizer(stemmer)
267
- summarizer.stop_words = get_stop_words(language)
268
- scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
269
- summary = []
270
- summary_chars = 0
271
- summary_chars_penultimate = 0
272
- while summary_chars < max_chars:
273
- try:
274
- next_sentence = next(scored_sentences)
275
- summary.append(next_sentence)
276
- summary_chars_penultimate = summary_chars
277
- summary_chars += len(" " + next_sentence[0]._text)
278
- except StopIteration:
279
- break
280
- summary = sorted(summary, key=lambda x: x[2])
281
- summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
282
- if summary_chars > max_chars:
283
- summary[-1] = (
284
- summary[-1][0][: max_chars - summary_chars_penultimate],
285
- summary[-1][1],
286
- )
287
- if not with_scores:
288
- summary = " ".join([s[0] for s in summary])
289
- else:
290
- min_score = min([s[1] for s in summary]) if summary else 0
291
- max_score = max([min_score] + [s[1] for s in summary])
292
- score_range = 1 if min_score == max_score else (max_score - min_score)
293
- summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
294
- return summary
295
-
296
-
297
- def ner_pipe(
298
- text,
299
- language,
300
- spacy_model,
301
- flair_model=None,
302
- fast=False,
303
- compression_ratio="auto",
304
- with_scores=True,
305
- with_comentions=True,
306
- with_context=True,
307
- ):
308
- if compression_ratio == "auto":
309
- compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
310
- sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
311
- ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
312
- return ner
313
-
314
-
315
- def get_ner_handler(language, fast=False):
316
- try:
317
- get_nltk_tokenizer(language) # raises a LookupError if the language is not valid
318
- except LookupError:
319
- language = "english"
320
- spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
321
- flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
322
- return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
323
- text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
324
- )
325
-
326
-
327
- @st.cache_resource
328
- def get_cached_ner_handler(language, fast):
329
- return get_ner_handler(language, fast)