streamlit-octostar-utils 0.1.7a4__tar.gz → 0.1.7a6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/nlp/ner.py +143 -105
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/LICENSE +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/README.md +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/hello.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/nlp/language.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/ontology/expand_entities.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.1.7a4 → streamlit_octostar_utils-0.1.7a6}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
streamlit_octostar_utils/nlp/ner.py

@@ -14,6 +14,7 @@ import numpy as np
 import math
 import nltk
 from typing import Optional, List
+from pydantic import BaseModel, ConfigDict, Field
 
 SPACY_NER_MODELS = {
     "english": lambda: load_spacy(
@@ -24,15 +25,15 @@ SPACY_NER_MODELS = {
 FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
 REGEX_NER_MODELS = {
     "IP_ADDRESS": [
-        r"(?:(
-        r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}(?::\d{1,5})?(?:(?=\s)|(?=\b))"
+        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
     ],
-    "PHONE": r"(?:(
-    "EMAIL": r"
+    "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
+    "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
+    "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
 }
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE"]
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE", "IP_ADDRESS", "EMAIL"]
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
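A quick sanity check of the tightened IP_ADDRESS and EMAIL patterns — a minimal sketch using only Python's re module; the sample strings are invented:

import re

IPV4 = (
    r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
    r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
    r"(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b"
)
EMAIL = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b"

# Octets are capped at 255 and ports at 65535.
assert re.search(IPV4, "connect to 192.168.0.1:8080")
assert not re.search(IPV4, "not an address: 999.999.999.999")
# The domain must contain at least one dot-separated label.
assert re.search(EMAIL, "write to jane.doe@example.co.uk")
assert not re.search(EMAIL, "broken@nodot")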
@@ -75,33 +76,21 @@ def get_nltk_tokenizer(language: str) -> Tokenizer:
     return Tokenizer(language)
 
 
-class NERObject(
-
-
-
-
-
-
-
-
+class NERObject(BaseModel):
+    name: str
+    label: str
+    score: float = 0.0
+    start: int
+    count: int
+    context: str | None = None
+    comentions: list[str] = Field(default_factory=list)
+    model_config = ConfigDict(extra="allow")
 
-    def to_dict(self):
-        data = {
-            "name": self.name,
-            "label": self.label,
-            "score": self.score,
-            "context": self.context,
-            "count": self.count,
-            "comentions": self.comentions or [],
-        }
-        if self.sources:
-            data["sources"] = self.sources
-
     def __repr__(self):
         return f"NERObject(label={self.label},name={self.name})"
 
 
-def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
+def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
     if whitelisted_labels is not None:
         entities = [e for e in entities if e.label in whitelisted_labels]
     entities = sorted(entities, key=lambda x: x.name)
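With NERObject now a pydantic BaseModel configured with extra="allow", the hand-rolled to_dict becomes redundant: undeclared attributes such as sources survive construction and serialization. A minimal sketch (field values invented):

e = NERObject(name="Jane Doe", label="PERSON", score=0.9, start=42, count=1, sources=["doc-1"])
print(e.model_dump())  # declared fields plus the extra "sources" key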
@@ -109,29 +98,48 @@ def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
     for _, group in itertools.groupby(entities, key=lambda x: x.name):
         group = list(group)
         best_entity = max(group, key=lambda x: x.score * x.count)
-
-            best_entity.name,
-            best_entity.label,
-            best_entity.score,
-            best_entity.context,
-            sum(
-
-
-
-
+        merged_data = {
+            "name": best_entity.name,
+            "label": best_entity.label,
+            "score": best_entity.score,
+            "context": best_entity.context,
+            "count": sum(e.count for e in group),
+            "start": best_entity.start,
+        }
+        all_fields = best_entity.model_fields.keys()
+        for field in all_fields:
+            if field in merged_data:
+                continue
+            values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
+            if not values:
+                continue
+            if isinstance(values[0], list):
+                merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
+            else:
+                merged_data[field] = getattr(best_entity, field, None)
+        final_entities.append(NERObject(**merged_data))
     final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
     if max_entities and len(final_entities) > max_entities:
         final_entities = final_entities[:max_entities]
     return final_entities
 
 
-def compute_ner(
+def compute_ner(
+    language,
+    sentences,
+    spacy_model,
+    flair_model=None,
+    context_width=150,
+    with_scores=True,
+    with_comentions=True,
+    with_context=True,
+):
     sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
     del sentence_starts[-1]
     sentence_starts = list(np.cumsum(sentence_starts))
     text = "\n".join([s[0] for s in sentences])
     min_score = 1.0
-    entities = []
+    entities: list[NERObject] = []
 
     # FLAIR model (if not fast)
     if flair_model:
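The rewritten merge groups mentions by name, keeps the highest-scoring mention as the base, sums counts, and unions list-valued fields such as comentions. Its intended behavior in miniature (sample entities invented):

a = NERObject(name="ACME", label="ORG", score=0.8, start=10, count=2, comentions=["Jane"])
b = NERObject(name="ACME", label="ORG", score=0.5, start=90, count=1, comentions=["Bob"])
merged = postprocess_ner([a, b])
assert len(merged) == 1
assert merged[0].count == 3                          # counts are summed across the group
assert set(merged[0].comentions) == {"Jane", "Bob"}  # list fields are unioned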
@@ -139,88 +147,115 @@ def compute_ner(language, sentences, spacy_model, flair_model=None, context_widt
         flair_model.predict(input)
         output = [e for sentence in input for e in sentence.get_spans("ner")]
         flair_entities = [
-            (
-                entity.text,
-                BASE_TO_ONTONOTES_LABELMAP.get(
+            NERObject(
+                name=entity.text,
+                label=BASE_TO_ONTONOTES_LABELMAP.get(
                     entity.annotation_layers["ner"][0].value,
                     entity.annotation_layers["ner"][0].value,
                 ),
-                entity.score,
-                sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                score=entity.score,
+                start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                count=1,
             )
             for entity in output
         ]
-        min_score = min(min_score
+        min_score = min([min_score] + [e.score for e in flair_entities])
         entities += flair_entities
         del flair_entities
 
+    print("Checking REGEXES")
     # REGEX model
     for label, regexes in REGEX_NER_MODELS.items():
         if not isinstance(regexes, list):
             regexes = [regexes]
         for regex in regexes:
-            print(regex)
             regex_entities = [
-                (
+                NERObject(
+                    name=match.group(),
+                    label=label,
+                    score=min_score - 0.5,
+                    count=1,
+                    start=match.start(),
+                )
+                for match in re.finditer(regex, text)
             ]
-            print(regex_entities)
             entities += regex_entities
-
+            min_score = min([min_score] + [e.score for e in regex_entities])
+
     # SPACY model
-
-
-
-
-
-
-
-
-
-
-
+    print("CHECKING SPACY")
+    chunks = []
+    chunk_start_offsets = []
+    current_chunk = []
+    current_length = 0
+    offset = 0
+    for sentence, _ in sentences:
+        sentence_len = len(sentence) + 1
+        if sentence_len > spacy_model.max_length:
+            truncated = sentence[: spacy_model.max_length - 1]
+            chunks.append(truncated)
+            chunk_start_offsets.append(offset)
+            offset += sentence_len
+            continue
+        if current_length + sentence_len > spacy_model.max_length:
+            chunks.append("\n".join(current_chunk))
+            chunk_start_offsets.append(offset - current_length)
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sentence)
+        current_length += sentence_len
+        offset += sentence_len
+    if current_chunk:
+        chunks.append("\n".join(current_chunk))
+        chunk_start_offsets.append(offset - current_length)
+    for i, chunk in enumerate(chunks):
+        doc = spacy_model(chunk)
+        chunk_offset = chunk_start_offsets[i]
+        for entity in doc.ents:
+            entities.append(
+                NERObject(
+                    name=entity.text,
+                    label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
+                    score=min_score - 0.5,
+                    start=chunk_offset + entity.start_char,
+                    count=1,
+                )
+            )
 
     # Reformatting for consistency
-    if entities:
-
-
+    if not entities:
+        return []
+    if with_scores:
+        min_entity_score = min([e.score for e in entities])
+        max_entity_score = max([e.score for e in entities])
         entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-
-
+        for e in entities:
+            e.score = (e.score - min_entity_score) / entity_score_range
+        scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
         scores = [sentences[i - 1][1] for i in scores]
-        scores = [scores[i] +
+        scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
         for i in range(len(entities)):
-            entities[i] =
+            entities[i].score = scores[i]
+    else:
+        for i in range(len(entities)):
+            entities[i].score = 0.0
+    if with_comentions:
         for i in range(len(entities)):
             entity = entities[i]
-            count = 1
             comentions = [
-                entities[j]
+                entities[j].name
                 for j in range(len(entities))
-                if j != i and abs(entities[j]
+                if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
             ]
-            entities[i] =
-
-                entity[1],
-                entity[2],
-                entity[3],
-                count,
-                comentions,
-            )
+            entities[i].comentions = comentions
+    if with_context:
         for i in range(len(entities)):
             entity = entities[i]
-            if entity
-                left = max(0, entity
-                right = min(len(text), entity
+            if entity.start >= 0 and entity.start < len(text):
+                left = max(0, entity.start - math.floor(context_width / 2))
+                right = min(len(text), entity.start + math.ceil(context_width / 2))
                 context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-                entities[i] =
-                    entity[0],
-                    entity[1],
-                    entity[2],
-                    context,
-                    entity[4],
-                    entity[5],
-            )
-    entities = [NERObject(*entities[i]) for i in range(len(entities))]
+                entities[i].context = context
     return entities
 
 
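The new spaCy path batches sentences into chunks that stay under spacy_model.max_length and records each chunk's start offset in the joined text, so chunk-local entity positions map back to global ones as chunk_start_offsets[i] + entity.start_char. The bookkeeping in miniature (toy sentences, standard library only):

sentences = ["Alice met Bob.", "They founded ACME."]
text = "\n".join(sentences)
# Each sentence contributes len(sentence) + 1 characters (the "\n" separator included).
offsets = [0]
for s in sentences[:-1]:
    offsets.append(offsets[-1] + len(s) + 1)
assert text[offsets[1]:].startswith("They")
# "ACME" sits at start_char 13 of chunk 1, so its global start is offsets[1] + 13.
assert text[offsets[1] + 13 : offsets[1] + 17] == "ACME"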
@@ -264,33 +299,36 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
     return summary
 
 
-def ner_pipe(
+def ner_pipe(
+    text,
+    language,
+    spacy_model,
+    flair_model=None,
+    fast=False,
+    compression_ratio="auto",
+    with_scores=True,
+    with_comentions=True,
+    with_context=True,
+):
     if compression_ratio == "auto":
         compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
     sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
-    ner = compute_ner(language, sentences, spacy_model, flair_model)
+    ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
     return ner
 
 
-def get_ner_handler(language, fast=False
+def get_ner_handler(language, fast=False):
     try:
         get_nltk_tokenizer(language)  # raises a LookupError if the language is not valid
     except LookupError:
         language = "english"
-    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS[
-    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS[
-    return lambda text
+    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
+    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
+    return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
+        text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
+    )
 
 
 @st.cache_resource
 def get_cached_ner_handler(language, fast):
     return get_ner_handler(language, fast)
-
-
-def test():
-    text = """My name is Valerio Simoni, and I live in NYC. I love the Colosseum, and my phone is +123 456 789.
-    my email is aaa@ggg.com, but my ip address is secret! 123.123.123.0:1111"""
-    entities = get_ner_handler("english", True)(text)
-    print(entities)
-
-test()
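With the module-level test() removed from the package, an equivalent smoke test can still be run by hand. A sketch of the new handler signature (sample text invented; fast=True skips the FLAIR model):

handler = get_ner_handler("english", fast=True)
entities = handler(
    "Jane Doe lives in NYC; reach her at jane@example.com or 10.0.0.1:8080.",
    with_scores=True,
    with_comentions=True,
    with_context=True,
)
print(postprocess_ner(entities, whitelisted_labels=BASE_ALLOWED_LABELS, max_entities=10))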