streamlit-octostar-utils 0.1.7a3__tar.gz → 0.1.7a5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/celery.py +15 -5
  4. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/nifi.py +10 -4
  5. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/language.py +1 -1
  6. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/ner.py +91 -94
  7. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/LICENSE +0 -0
  8. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/README.md +0 -0
  9. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/__init__.py +0 -0
  10. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  11. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  13. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  14. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  15. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  16. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  17. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  18. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  19. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  20. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  21. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  22. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/__init__.py +0 -0
  23. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/dict.py +0 -0
  24. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/filetypes.py +0 -0
  25. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  26. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  27. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/timestamp.py +0 -0
  28. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/hello.py +0 -0
  29. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  31. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/client.py +0 -0
  32. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/context.py +0 -0
  33. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  34. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  35. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
  36. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  37. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/validation.py +0 -0
  38. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/style/__init__.py +0 -0
  39. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/style/common.py +0 -0
  40. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  42. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  43. {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: streamlit-octostar-utils
- Version: 0.1.7a3
+ Version: 0.1.7a5
  Summary:
  License: MIT
  Author: Octostar
pyproject.toml
@@ -5,7 +5,7 @@ include = '\.pyi?$'

  [tool.poetry]
  name = "streamlit-octostar-utils"
- version = "0.1.7a3"
+ version = "0.1.7a5"
  description = ""
  license = "MIT"
  authors = ["Octostar"]
streamlit_octostar_utils/api_crafter/celery.py
@@ -23,7 +23,7 @@ from functools import wraps
  import logging

  logger = logging.getLogger(__name__)
- logging.getLogger('pottery').setLevel(logging.WARNING)
+ logging.getLogger("pottery").setLevel(logging.WARNING)
  from celery.app.defaults import DEFAULTS as CELERY_DEFAULTS
  import urllib

@@ -630,6 +630,11 @@ class FastAPICeleryTaskRoute(Route):
  except BaseException as e:
  exc = e
  data = {}
+ assert (
+ (state in ["FAILURE", "RETRY", "REVOKED"] and exc is not None)
+ or (state in ["SUCCESS"] and exc is None)
+ or (state not in ["SUCCESS", "FAILURE", "RETRY", "REVOKED"])
+ )
  if state in ["FAILURE", "RETRY", "REVOKED"]:
  error_response = DefaultErrorRoute.format_error(exc, debug=True).body.decode("utf-8")
  data = {
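The new assert pins down the invariant between the Celery task state and the exception captured in the preceding try/except: failure-like states must carry an exception, SUCCESS must not, and every other state is unconstrained. A minimal sketch of the same check as a standalone helper (the helper name is hypothetical, not part of the package):

    from typing import Optional

    def check_state_exc_invariant(state: str, exc: Optional[BaseException]) -> None:
        # Hypothetical restatement of the inline assert above.
        if state in ("FAILURE", "RETRY", "REVOKED"):
            assert exc is not None  # failure-like states must carry the causing exception
        elif state == "SUCCESS":
            assert exc is None      # a successful task must not carry one
        # other states (e.g. PENDING, STARTED) are left unconstrained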
@@ -682,11 +687,16 @@ class CeleryErrorRoute(DefaultErrorRoute):
  debug=False,
  excs_to_status_codes=None,
  silenced_excs=None,
- log_filter=None
- ,
+ log_filter=None,
  ):
  if excs_to_status_codes is None:
- excs_to_status_codes = {**DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS, **CeleryErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS}
+ excs_to_status_codes = {
+ **DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
+ **CeleryErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
+ }
  if silenced_excs is None:
- silenced_excs = {**DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS, **CeleryErrorRoute.DEFAULT_SILENCED_EXCEPTIONS}
+ silenced_excs = {
+ **DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
+ **CeleryErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
+ }
  DefaultErrorRoute.add_default_exceptions_handler(fs_app, debug, excs_to_status_codes)
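Both defaults are now merged with dict unpacking, where later entries win, so CeleryErrorRoute's mappings override DefaultErrorRoute's on any shared key. A tiny illustration of that precedence with made-up exception names and status codes:

    default_map = {"ValueError": 400, "TimeoutError": 504}    # stand-in for DefaultErrorRoute defaults
    celery_map = {"TimeoutError": 408}                         # stand-in for CeleryErrorRoute defaults
    merged = {**default_map, **celery_map}
    assert merged == {"ValueError": 400, "TimeoutError": 408}  # the later (Celery) entry wins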
streamlit_octostar_utils/api_crafter/nifi.py
@@ -1103,14 +1103,20 @@ class NifiRoute(Route):
  def define_routes(self):
  @Route.route(self, path="/task-state/{task_id}")
  async def get_task_status(task_id: str) -> JSONResponse:
- task_status = await self.tasks_routes.get_task(task_id, pop=False)
- task_status = task_status.model_dump(mode="json")["data"]["task_state"]
+ try:
+ task_status = await self.tasks_routes.get_task(task_id, pop=False)
+ task_status = task_status.model_dump(mode="json")["data"]["task_state"]
+ except BaseException as e:
+ raise ValueError(f"Could not fetch task state for task id {task_id}!\n{e}")
  return JSONResponse(task_status)

  @Route.route(self, path="/task-result/{task_id}")
  async def get_task_result(task_id: str) -> JSONResponse:
- return_data = await self.tasks_routes.get_task(task_id, pop=True)
- return_data = return_data.model_dump(mode="json")["data"]["data"]
+ try:
+ return_data = await self.tasks_routes.get_task(task_id, pop=True)
+ return_data = return_data.model_dump(mode="json")["data"]["data"]
+ except BaseException as e:
+ raise ValueError(f"Could not fetch task result for task id {task_id}\n{e}!")
  return JSONResponse(return_data)

  @Route.route(self, path="/{op}", methods=["POST"])
streamlit_octostar_utils/nlp/language.py
@@ -12,4 +12,4 @@ def detect_language(text, min_confidence=None):
  return None
  detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
  detected_lang = languages.to_name(detected_lang).lower()
- return detected_lang
+ return detected_lang, confidence
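detect_language now returns the detected language together with the detector's confidence, so callers unpack a tuple instead of a bare string (and still receive None below min_confidence). A minimal usage sketch; the sample text, threshold, and printed confidence are illustrative:

    result = detect_language("Ceci est un exemple de texte.", min_confidence=0.5)
    if result is not None:
        language, confidence = result  # e.g. ("french", 0.97)
        print(f"{language} ({confidence:.2f})")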
streamlit_octostar_utils/nlp/ner.py
@@ -1,7 +1,5 @@
  import re
  import streamlit as st
- import py3langid as langid
- import iso639 as languages
  from spacy_download import load_spacy
  from flair.data import Sentence
  from flair.models import SequenceTagger
@@ -17,8 +15,6 @@ import math
  import nltk
  from typing import Optional, List

- nltk.download("punkt")
-
  SPACY_NER_MODELS = {
  "english": lambda: load_spacy(
  "en_core_web_sm",
@@ -26,9 +22,17 @@ SPACY_NER_MODELS = {
  )
  }
  FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
+ REGEX_NER_MODELS = {
+ "IP_ADDRESS": [
+ r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?(?:(?=\s)|(?=\b))",
+ r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}(?::\d{1,5})?(?:(?=\s)|(?=\b))"
+ ],
+ "PHONE": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[+]?[(]?[0-9]{1,4}[)]?[-\s\/0-9]*(?:(?=\s)|(?=\b))",
+ "EMAIL": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:(?=\s)|(?=\b))",
+ }

  BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
- BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME"]
+ BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE", "IP_ADDRESS", "EMAIL"]


  def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
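REGEX_NER_MODELS maps a label to one regex or a list of regexes (IPv4 and IPv6 variants for IP_ADDRESS), which compute_ner later applies to the joined text with re.finditer. A rough standalone sketch of that scan, using a deliberately simplified email pattern rather than the package's stricter ones:

    import re

    # Simplified pattern for illustration only.
    patterns = {"EMAIL": [r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}"]}

    text = "Contact info@example.com or sales@example.org for details."
    for label, regexes in patterns.items():
        for regex in regexes:
            for match in re.finditer(regex, text):
                print(label, match.group(), match.start())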
@@ -39,9 +43,7 @@ def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
  if isinstance(rating, dict):
  assert not args and not kwargs
  rate = lambda s: rating[s]
- infos = (
- SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences)
- )
+ infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
  infos = sorted(infos, key=attrgetter("rating"), reverse=True)
  return tuple((i.sentence, i.rating, i.order) for i in infos)

@@ -62,9 +64,15 @@ def _sumy__lsa_call(summarizer, document):

  def _sumy__luhn_call(summarizer, document):
  words = summarizer._get_significant_words(document.words)
- return _sumy__get_best_sentences(
- document.sentences, summarizer.rate_sentence, words
- )
+ return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
+
+
+ def get_nltk_tokenizer(language: str) -> Tokenizer:
+ try:
+ nltk.data.find("tokenizers/punkt")
+ except LookupError:
+ nltk.download("punkt")
+ return Tokenizer(language)


@@ -77,22 +85,26 @@ class NERObject(object):
  self.comentions: Optional[List[str]] = comentions
  self.sources: Optional[List[str]] = list()

-
- def detect_language(text, min_confidence=None):
- detector = langid.langid.LanguageIdentifier.from_pickled_model(
- langid.langid.MODEL_FILE, norm_probs=True
- )
- detected_lang, confidence = detector.classify(text)
- if min_confidence and confidence < min_confidence:
- return None
- detected_lang = re.sub("[^A-Za-z]", "", detected_lang).lower()
- detected_lang = languages.to_name(detected_lang).lower()
- return detected_lang
+ def to_dict(self):
+ data = {
+ "name": self.name,
+ "label": self.label,
+ "score": self.score,
+ "context": self.context,
+ "count": self.count,
+ "comentions": self.comentions or [],
+ }
+ if self.sources:
+ data["sources"] = self.sources
+ return data
+
+ def __repr__(self):
+ return f"NERObject(label={self.label},name={self.name})"


- def postprocess_ner(entities, allowed_labels, max_entities=100):
- if allowed_labels != "all":
- entities = [e for e in entities if e.label in allowed_labels]
+ def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
+ if whitelisted_labels is not None:
+ entities = [e for e in entities if e.label in whitelisted_labels]
  entities = sorted(entities, key=lambda x: x.name)
  final_entities = []
  for _, group in itertools.groupby(entities, key=lambda x: x.name):
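The postprocess_ner contract changes here: whitelisted_labels=None now means keep every label (previously the caller passed "all"), and max_entities=None disables truncation (previously a hard default of 100). Together with the new NERObject.to_dict, a hedged fragment of the new call shapes, where entities stands for whatever compute_ner returned:

    all_entities = postprocess_ner(entities)                                # no filtering, no cap
    people_only = postprocess_ner(entities, whitelisted_labels=["PERSON"])  # label whitelist
    top_25 = postprocess_ner(entities, max_entities=25)                     # keep the 25 best
    as_json_rows = [e.to_dict() for e in top_25]                            # JSON-serializable dicts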
@@ -108,34 +120,24 @@ def postprocess_ner(entities, allowed_labels, max_entities=100):
  )
  best_entity.sources = list(set(itertools.chain(*[e.sources for e in group])))
  final_entities.append(best_entity)
- final_entities = sorted(
- final_entities, key=lambda x: x.score * x.count, reverse=True
- )
- if len(final_entities) > max_entities:
+ final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
+ if max_entities and len(final_entities) > max_entities:
  final_entities = final_entities[:max_entities]
  return final_entities


- def compute_ner(language, sentences, fast=True, context_width=150):
+ def compute_ner(language, sentences, spacy_model, flair_model=None, context_width=150):
  sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
  del sentence_starts[-1]
  sentence_starts = list(np.cumsum(sentence_starts))
  text = "\n".join([s[0] for s in sentences])
- if fast:
- model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
- entities = [
- (
- entity.text,
- BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
- 0,
- entity.start_char,
- )
- for entity in model(text).ents
- ]
- else:
- model = FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
+ min_score = 1.0
+ entities = []
+
+ # FLAIR model (if not fast)
+ if flair_model:
  input = [Sentence(sentence[0]) for sentence in sentences]
- model.predict(input)
+ flair_model.predict(input)
  output = [e for sentence in input for e in sentence.get_spans("ner")]
  flair_entities = [
  (
@@ -145,35 +147,45 @@ def compute_ner(language, sentences, fast=True, context_width=150):
  entity.annotation_layers["ner"][0].value,
  ),
  entity.score,
- sentence_starts[input.index(entity[0].sentence)]
- + entity[0].start_position,
+ sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
  )
  for entity in output
  ]
- min_score = min([0] + [e[2] for e in flair_entities])
- model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
- spacy_entities = [
- (
- entity.text,
- BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
- min_score - 1,
- entity.start_char,
- )
- for entity in model(text).ents
- ]
- entities = flair_entities + spacy_entities
+ min_score = min(min_score, *[e[2] for e in flair_entities])
+ entities += flair_entities
+ del flair_entities
+
+ # REGEX model
+ for label, regexes in REGEX_NER_MODELS.items():
+ if not isinstance(regexes, list):
+ regexes = [regexes]
+ for regex in regexes:
+ print(regex)
+ regex_entities = [
+ (match.group(), label, min_score - 0.5, match.start()) for match in re.finditer(regex, text)
+ ]
+ print(regex_entities)
+ entities += regex_entities
+
+ # SPACY model
+ spacy_entities = [
+ (
+ entity.text,
+ BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
+ min_score - 1,
+ entity.start_char,
+ )
+ for entity in spacy_model(text).ents
+ ]
+ entities += spacy_entities
+ del spacy_entities
+
+ # Reformatting for consistency
  if entities:
  min_entity_score = min([e[2] for e in entities])
  max_entity_score = max([min_entity_score] + [e[2] for e in entities])
- entity_score_range = (
- 1
- if min_entity_score == max_entity_score
- else (max_entity_score - min_entity_score)
- )
- entities = [
- (e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3])
- for e in entities
- ]
+ entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
+ entities = [(e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3]) for e in entities]
  scores = list(np.searchsorted(sentence_starts, [e[3] + 1 for e in entities]))
  scores = [sentences[i - 1][1] for i in scores]
  scores = [scores[i] + int(10 * entities[i][2]) for i in range(len(entities))]
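With the three sources appended, the raw scores sit on different scales: Flair confidences as-is, regex hits at min_score - 0.5, spaCy hits at min_score - 1. The block above therefore rescales everything into [0, 1] with a min-max normalization. A tiny numeric illustration with made-up raw scores:

    raw = [0.99, 0.90, 0.5, 0.0]                 # e.g. two Flair hits, one regex hit, one spaCy hit
    lo, hi = min(raw), max(raw)
    span = 1 if lo == hi else (hi - lo)
    normalized = [(s - lo) / span for s in raw]  # -> [1.0, 0.909..., 0.505..., 0.0]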
@@ -185,8 +197,7 @@ def compute_ner(language, sentences, fast=True, context_width=150):
  comentions = [
  entities[j][0]
  for j in range(len(entities))
- if j != i
- and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
+ if j != i and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
  ]
  entities[i] = (
  entity[0],
@@ -201,11 +212,7 @@ def compute_ner(language, sentences, fast=True, context_width=150):
  if entity[3] >= 0 and entity[3] < len(text):
  left = max(0, entity[3] - math.floor(context_width / 2))
  right = min(len(text), entity[3] + math.ceil(context_width / 2))
- context = (
- ("[..]" if left > 0 else "")
- + text[left:right]
- + ("[..]" if right < len(text) else "")
- )
+ context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
  entities[i] = (
  entity[0],
  entity[1],
@@ -214,22 +221,12 @@
  entity[4],
  entity[5],
  )
- entities = [
- NERObject(
- entities[i][0],
- entities[i][1],
- entities[i][2],
- entities[i][3],
- entities[i][4],
- entities[i][5],
- )
- for i in range(len(entities))
- ]
+ entities = [NERObject(*entities[i]) for i in range(len(entities))]
  return entities


  def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
- tokenizer = Tokenizer(language)
+ tokenizer = get_nltk_tokenizer(language)
  stemmer = Stemmer(language)
  parser = PlaintextParser.from_string(text, tokenizer)
  if fast:
@@ -268,24 +265,24 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
  return summary


- def ner_pipe(text, language, fast=False, compression_ratio="auto"):
+ def ner_pipe(text, language, spacy_model, flair_model=None, fast=False, compression_ratio="auto"):
  if compression_ratio == "auto":
  compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
- sentences = get_extractive_summary(
- text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True
- )
- ner = compute_ner(language, sentences, fast=fast)
+ sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
+ ner = compute_ner(language, sentences, spacy_model, flair_model)
  return ner


  def get_ner_handler(language, fast=False, compression_ratio="auto"):
  try:
- Tokenizer(language) # raises a LookupError if the language is not valid
+ get_nltk_tokenizer(language) # raises a LookupError if the language is not valid
  except LookupError:
  language = "english"
- return lambda text: ner_pipe(text, language, fast, compression_ratio)
+ spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS['english'])()
+ flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS['english'])()
+ return lambda text: ner_pipe(text, language, spacy_model, flair_model, fast, compression_ratio)


  @st.cache_resource
  def get_cached_ner_handler(language, fast):
- return get_ner_handler(language, fast)
+ return get_ner_handler(language, fast)
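After this refactor, get_ner_handler resolves the spaCy pipeline once (and the Flair tagger only when fast is False), and the returned closure threads those preloaded models through ner_pipe into compute_ner. A hedged end-to-end sketch of calling it from a Streamlit app; the sample text is illustrative:

    import streamlit as st

    # fast=True skips the Flair tagger and relies on spaCy plus the regex patterns only
    handler = get_cached_ner_handler("english", fast=True)

    text = st.text_area("Text to analyze", "Jane Doe emailed jane@example.com from 10.0.0.1.")
    entities = postprocess_ner(handler(text), whitelisted_labels=BASE_ALLOWED_LABELS, max_entities=50)
    st.json([e.to_dict() for e in entities])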