streamlit-octostar-utils 0.1.7a3__tar.gz → 0.1.7a5__tar.gz
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that registry.
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/celery.py +15 -5
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/nifi.py +10 -4
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/language.py +1 -1
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/ner.py +91 -94
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/LICENSE +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/README.md +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/hello.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.1.7a3 → streamlit_octostar_utils-0.1.7a5}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
streamlit_octostar_utils/api_crafter/celery.py

```diff
@@ -23,7 +23,7 @@ from functools import wraps
 import logging
 
 logger = logging.getLogger(__name__)
-logging.getLogger(
+logging.getLogger("pottery").setLevel(logging.WARNING)
 from celery.app.defaults import DEFAULTS as CELERY_DEFAULTS
 import urllib
 
```
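The functional change here quiets the `pottery` library's logger. A minimal standalone sketch of the idiom (the `basicConfig` call and the `"app"` logger name are only for the demo):

```python
import logging

logging.basicConfig(level=logging.INFO)

# Raise a noisy third-party logger's threshold without touching other loggers.
logging.getLogger("pottery").setLevel(logging.WARNING)

logging.getLogger("pottery").info("dropped: below WARNING")
logging.getLogger("app").info("still emitted: only 'pottery' was raised")
```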
```diff
@@ -630,6 +630,11 @@ class FastAPICeleryTaskRoute(Route):
         except BaseException as e:
             exc = e
             data = {}
+        assert (
+            (state in ["FAILURE", "RETRY", "REVOKED"] and exc is not None)
+            or (state in ["SUCCESS"] and exc is None)
+            or (state not in ["SUCCESS", "FAILURE", "RETRY", "REVOKED"])
+        )
         if state in ["FAILURE", "RETRY", "REVOKED"]:
             error_response = DefaultErrorRoute.format_error(exc, debug=True).body.decode("utf-8")
             data = {
```
```diff
@@ -682,11 +687,16 @@ class CeleryErrorRoute(DefaultErrorRoute):
         debug=False,
         excs_to_status_codes=None,
         silenced_excs=None,
-        log_filter=None
-        ,
+        log_filter=None,
     ):
         if excs_to_status_codes is None:
-            excs_to_status_codes = {
+            excs_to_status_codes = {
+                **DefaultErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
+                **CeleryErrorRoute.DEFAULT_STATUS_CODE_MAPPINGS,
+            }
         if silenced_excs is None:
-            silenced_excs = {
+            silenced_excs = {
+                **DefaultErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
+                **CeleryErrorRoute.DEFAULT_SILENCED_EXCEPTIONS,
+            }
         DefaultErrorRoute.add_default_exceptions_handler(fs_app, debug, excs_to_status_codes)
```
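`CeleryErrorRoute` now seeds its defaults by merging the base class tables with its own. The `**` merge is order-sensitive; a small illustration with stand-in values (the real contents of `DEFAULT_STATUS_CODE_MAPPINGS` are not shown in this diff):

```python
# Later dicts win on key collisions, so the Celery-specific entries
# override the base defaults where both map the same exception type.
BASE = {ValueError: 400, KeyError: 404}
CELERY = {KeyError: 409, TimeoutError: 504}

merged = {**BASE, **CELERY}
assert merged == {ValueError: 400, KeyError: 409, TimeoutError: 504}
```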
streamlit_octostar_utils/api_crafter/nifi.py

```diff
@@ -1103,14 +1103,20 @@ class NifiRoute(Route):
     def define_routes(self):
         @Route.route(self, path="/task-state/{task_id}")
         async def get_task_status(task_id: str) -> JSONResponse:
-
-
+            try:
+                task_status = await self.tasks_routes.get_task(task_id, pop=False)
+                task_status = task_status.model_dump(mode="json")["data"]["task_state"]
+            except BaseException as e:
+                raise ValueError(f"Could not fetch task state for task id {task_id}!\n{e}")
             return JSONResponse(task_status)
 
         @Route.route(self, path="/task-result/{task_id}")
         async def get_task_result(task_id: str) -> JSONResponse:
-
-
+            try:
+                return_data = await self.tasks_routes.get_task(task_id, pop=True)
+                return_data = return_data.model_dump(mode="json")["data"]["data"]
+            except BaseException as e:
+                raise ValueError(f"Could not fetch task result for task id {task_id}\n{e}!")
             return JSONResponse(return_data)
 
         @Route.route(self, path="/{op}", methods=["POST"])
```
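Both routes now fail loudly instead of returning undefined locals. Note the asymmetry: `/task-state` reads with `pop=False`, while `/task-result` pops the task, so a result can only be fetched once. A hypothetical client flow (base URL and task id are placeholders):

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder; depends on deployment
task_id = "0000-demo-task"          # placeholder id

# Polling the state is non-destructive (pop=False server-side)...
state = requests.get(f"{BASE_URL}/task-state/{task_id}").json()

# ...but the result is popped on read, so fetch it exactly once.
if state == "SUCCESS":
    result = requests.get(f"{BASE_URL}/task-result/{task_id}").json()
```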
streamlit_octostar_utils/nlp/ner.py

```diff
@@ -1,7 +1,5 @@
 import re
 import streamlit as st
-import py3langid as langid
-import iso639 as languages
 from spacy_download import load_spacy
 from flair.data import Sentence
 from flair.models import SequenceTagger
```
```diff
@@ -17,8 +15,6 @@ import math
 import nltk
 from typing import Optional, List
 
-nltk.download("punkt")
-
 SPACY_NER_MODELS = {
     "english": lambda: load_spacy(
         "en_core_web_sm",
```
```diff
@@ -26,9 +22,17 @@ SPACY_NER_MODELS = {
     )
 }
 FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
+REGEX_NER_MODELS = {
+    "IP_ADDRESS": [
+        r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?(?:(?=\s)|(?=\b))",
+        r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}(?::\d{1,5})?(?:(?=\s)|(?=\b))"
+    ],
+    "PHONE": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[+]?[(]?[0-9]{1,4}[)]?[-\s\/0-9]*(?:(?=\s)|(?=\b))",
+    "EMAIL": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:(?=\s)|(?=\b))",
+}
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME"]
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE", "IP_ADDRESS", "EMAIL"]
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
```
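The new `REGEX_NER_MODELS` table adds pattern-based extractors alongside the statistical models; every pattern is wrapped in the same lookbehind/lookahead delimiters. A quick check of the EMAIL pattern, copied verbatim from the hunk above:

```python
import re

EMAIL = r"(?:(?<=:=)|(?<=\s)|(?<=\b))[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:(?=\s)|(?=\b))"

sample = "Reach ops@example.com or admin@test.org for access."
print([m.group() for m in re.finditer(EMAIL, sample)])
# ['ops@example.com', 'admin@test.org']
```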
```diff
@@ -39,9 +43,7 @@ def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
     if isinstance(rating, dict):
         assert not args and not kwargs
         rate = lambda s: rating[s]
-    infos = (
-        SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences)
-    )
+    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
     infos = sorted(infos, key=attrgetter("rating"), reverse=True)
     return tuple((i.sentence, i.rating, i.order) for i in infos)
 
```
```diff
@@ -62,9 +64,15 @@ def _sumy__lsa_call(summarizer, document):
 
 def _sumy__luhn_call(summarizer, document):
     words = summarizer._get_significant_words(document.words)
-    return _sumy__get_best_sentences(
-
-
+    return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
+
+
+def get_nltk_tokenizer(language: str) -> Tokenizer:
+    try:
+        nltk.data.find("tokenizers/punkt")
+    except LookupError:
+        nltk.download("punkt")
+    return Tokenizer(language)
 
 
 class NERObject(object):
```
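`get_nltk_tokenizer` replaces the unconditional module-level `nltk.download("punkt")` removed earlier: the corpus is now fetched only when it is actually missing. The same check-before-download idiom, generalized (the helper name is illustrative):

```python
import nltk

def ensure_nltk_resource(find_path: str, package: str) -> None:
    # nltk.data.find raises LookupError when the resource is absent.
    try:
        nltk.data.find(find_path)
    except LookupError:
        nltk.download(package)

ensure_nltk_resource("tokenizers/punkt", "punkt")
```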
```diff
@@ -77,22 +85,26 @@ class NERObject(object):
         self.comentions: Optional[List[str]] = comentions
         self.sources: Optional[List[str]] = list()
 
-
-
-
-
-
-
-
-
-
-
-
+    def to_dict(self):
+        data = {
+            "name": self.name,
+            "label": self.label,
+            "score": self.score,
+            "context": self.context,
+            "count": self.count,
+            "comentions": self.comentions or [],
+        }
+        if self.sources:
+            data["sources"] = self.sources
+        return data
+
+    def __repr__(self):
+        return f"NERObject(label={self.label},name={self.name})"
 
 
-def postprocess_ner(entities,
-    if
-    entities = [e for e in entities if e.label in
+def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
+    if whitelisted_labels is not None:
+        entities = [e for e in entities if e.label in whitelisted_labels]
     entities = sorted(entities, key=lambda x: x.name)
     final_entities = []
     for _, group in itertools.groupby(entities, key=lambda x: x.name):
```
|
|
108
120
|
)
|
109
121
|
best_entity.sources = list(set(itertools.chain(*[e.sources for e in group])))
|
110
122
|
final_entities.append(best_entity)
|
111
|
-
final_entities = sorted(
|
112
|
-
|
113
|
-
)
|
114
|
-
if len(final_entities) > max_entities:
|
123
|
+
final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
|
124
|
+
if max_entities and len(final_entities) > max_entities:
|
115
125
|
final_entities = final_entities[:max_entities]
|
116
126
|
return final_entities
|
117
127
|
|
118
128
|
|
119
|
-
def compute_ner(language, sentences,
|
129
|
+
def compute_ner(language, sentences, spacy_model, flair_model=None, context_width=150):
|
120
130
|
sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
|
121
131
|
del sentence_starts[-1]
|
122
132
|
sentence_starts = list(np.cumsum(sentence_starts))
|
123
133
|
text = "\n".join([s[0] for s in sentences])
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
|
130
|
-
0,
|
131
|
-
entity.start_char,
|
132
|
-
)
|
133
|
-
for entity in model(text).ents
|
134
|
-
]
|
135
|
-
else:
|
136
|
-
model = FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
|
134
|
+
min_score = 1.0
|
135
|
+
entities = []
|
136
|
+
|
137
|
+
# FLAIR model (if not fast)
|
138
|
+
if flair_model:
|
137
139
|
input = [Sentence(sentence[0]) for sentence in sentences]
|
138
|
-
|
140
|
+
flair_model.predict(input)
|
139
141
|
output = [e for sentence in input for e in sentence.get_spans("ner")]
|
140
142
|
flair_entities = [
|
141
143
|
(
|
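Both signatures loosen up: `postprocess_ner` makes the label whitelist and the entity cap optional (previously `allowed_labels` was required and `max_entities` defaulted to 100), and `compute_ner` now receives its models as arguments instead of loading them internally. Hypothetical calls under the new `postprocess_ner` signature, assuming `entities` is a list of `NERObject` from `compute_ner`:

```python
# No filtering and no cap:
everything = postprocess_ner(entities)

# Or keep only people, capped at the ten highest score*count entries:
people = postprocess_ner(entities, whitelisted_labels=["PERSON"], max_entities=10)
```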
```diff
@@ -145,35 +147,45 @@ def compute_ner(language, sentences, fast=True, context_width=150):
                 entity.annotation_layers["ner"][0].value,
             ),
             entity.score,
-            sentence_starts[input.index(entity[0].sentence)]
-            + entity[0].start_position,
+            sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
         )
         for entity in output
     ]
-    min_score = min(
-
-
-
-
-
-
-
-
-
-
+        min_score = min(min_score, *[e[2] for e in flair_entities])
+        entities += flair_entities
+        del flair_entities
+
+    # REGEX model
+    for label, regexes in REGEX_NER_MODELS.items():
+        if not isinstance(regexes, list):
+            regexes = [regexes]
+        for regex in regexes:
+            print(regex)
+            regex_entities = [
+                (match.group(), label, min_score - 0.5, match.start()) for match in re.finditer(regex, text)
+            ]
+            print(regex_entities)
+            entities += regex_entities
+
+    # SPACY model
+    spacy_entities = [
+        (
+            entity.text,
+            BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
+            min_score - 1,
+            entity.start_char,
+        )
+        for entity in spacy_model(text).ents
+    ]
+    entities += spacy_entities
+    del spacy_entities
+
+    # Reformatting for consistency
     if entities:
         min_entity_score = min([e[2] for e in entities])
         max_entity_score = max([min_entity_score] + [e[2] for e in entities])
-        entity_score_range = (
-
-            if min_entity_score == max_entity_score
-            else (max_entity_score - min_entity_score)
-        )
-        entities = [
-            (e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3])
-            for e in entities
-        ]
+        entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
+        entities = [(e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3]) for e in entities]
     scores = list(np.searchsorted(sentence_starts, [e[3] + 1 for e in entities]))
     scores = [sentences[i - 1][1] for i in scores]
     scores = [scores[i] + int(10 * entities[i][2]) for i in range(len(entities))]
```
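The rewritten body layers the three extractors onto one score scale: flair keeps its native confidences, regex hits are placed 0.5 below the running minimum, spacy hits a full point below, and the combined list is then min-max rescaled into [0, 1]. The rescaling step in isolation, with illustrative values:

```python
# flair, flair, regex (min - 0.5), spacy (min - 1.0)
raw = [0.91, 0.75, 0.25, -0.25]
lo, hi = min(raw), max(raw)
rng = 1 if lo == hi else hi - lo  # guard against a zero division, as above
print([(s - lo) / rng for s in raw])  # spacy lands at 0.0, the best flair hit at 1.0
```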
```diff
@@ -185,8 +197,7 @@ def compute_ner(language, sentences, fast=True, context_width=150):
         comentions = [
             entities[j][0]
             for j in range(len(entities))
-            if j != i
-            and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
+            if j != i and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
         ]
         entities[i] = (
             entity[0],
```
```diff
@@ -201,11 +212,7 @@ def compute_ner(language, sentences, fast=True, context_width=150):
         if entity[3] >= 0 and entity[3] < len(text):
             left = max(0, entity[3] - math.floor(context_width / 2))
             right = min(len(text), entity[3] + math.ceil(context_width / 2))
-            context = (
-                ("[..]" if left > 0 else "")
-                + text[left:right]
-                + ("[..]" if right < len(text) else "")
-            )
+            context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
             entities[i] = (
                 entity[0],
                 entity[1],
```
```diff
@@ -214,22 +221,12 @@ def compute_ner(language, sentences, fast=True, context_width=150):
                 entity[4],
                 entity[5],
             )
-    entities = [
-        NERObject(
-            entities[i][0],
-            entities[i][1],
-            entities[i][2],
-            entities[i][3],
-            entities[i][4],
-            entities[i][5],
-        )
-        for i in range(len(entities))
-    ]
+    entities = [NERObject(*entities[i]) for i in range(len(entities))]
     return entities
 
 
 def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
-    tokenizer =
+    tokenizer = get_nltk_tokenizer(language)
     stemmer = Stemmer(language)
     parser = PlaintextParser.from_string(text, tokenizer)
     if fast:
```
```diff
@@ -268,24 +265,24 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
     return summary
 
 
-def ner_pipe(text, language, fast=False, compression_ratio="auto"):
+def ner_pipe(text, language, spacy_model, flair_model=None, fast=False, compression_ratio="auto"):
     if compression_ratio == "auto":
         compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
-    sentences = get_extractive_summary(
-
-    )
-    ner = compute_ner(language, sentences, fast=fast)
+    sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
+    ner = compute_ner(language, sentences, spacy_model, flair_model)
     return ner
 
 
 def get_ner_handler(language, fast=False, compression_ratio="auto"):
     try:
-
+        get_nltk_tokenizer(language)  # raises a LookupError if the language is not valid
     except LookupError:
         language = "english"
-
+    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS['english'])()
+    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS['english'])()
+    return lambda text: ner_pipe(text, language, spacy_model, flair_model, fast, compression_ratio)
 
 
 @st.cache_resource
 def get_cached_ner_handler(language, fast):
-    return get_ner_handler(language, fast)
+    return get_ner_handler(language, fast)
```
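With model loading hoisted into `get_ner_handler`, the cached handler now closes over preloaded models instead of re-resolving them on every call. A hypothetical end-to-end use inside a Streamlit app (the input text is a placeholder):

```python
import streamlit as st
from streamlit_octostar_utils.nlp.ner import get_cached_ner_handler

# fast=True skips loading the flair model entirely; the spacy model and
# the regex table still run.
ner = get_cached_ner_handler("english", fast=True)
text = st.text_area("Text to analyze", "Ada Lovelace met Charles Babbage in London.")
entities = ner(text)
st.write([e.to_dict() for e in entities])
```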