streamlit-octostar-utils 0.1.7a4__py3-none-any.whl → 0.1.7a6__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
--- a/streamlit_octostar_utils/nlp/ner.py
+++ b/streamlit_octostar_utils/nlp/ner.py
@@ -14,6 +14,7 @@ import numpy as np
 import math
 import nltk
 from typing import Optional, List
+from pydantic import BaseModel, ConfigDict, Field
 
 SPACY_NER_MODELS = {
     "english": lambda: load_spacy(
@@ -24,15 +25,15 @@ SPACY_NER_MODELS = {
 FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
 REGEX_NER_MODELS = {
     "IP_ADDRESS": [
-        r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?(?:(?=\s)|(?=\b))",
-        r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}(?::\d{1,5})?(?:(?=\s)|(?=\b))"
+        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
     ],
-    "PHONE": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[+]?[(]?[0-9]{1,4}[)]?[-\s\/0-9]*(?:(?=\s)|(?=\b))",
-    "EMAIL": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:(?=\s)|(?=\b))",
+    "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
+    "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
+    "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
 }
 
 BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
-BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE"]
+BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE", "IP_ADDRESS", "EMAIL"]
 
 
 def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
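The rewritten patterns replace the old lookaround-based boundary tricks with plain \b anchors and range-checked components; note the 0.1.7a4 IPv6 alternative was dropped rather than tightened. A small sketch of the practical effect on the IPv4 rule (sample text and variable names are ours, and the optional port group is omitted here for brevity):

    import re

    # New-style octets are capped at 255; the old pattern accepted any 1-3 digits.
    NEW_IPV4 = (
        r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}"
        r"(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b"
    )
    OLD_IPV4 = r"(?:\d{1,3}\.){3}\d{1,3}"

    sample = "valid 192.168.0.1, bogus 999.999.999.999"
    print(re.findall(NEW_IPV4, sample))  # ['192.168.0.1']
    print(re.findall(OLD_IPV4, sample))  # ['192.168.0.1', '999.999.999.999']

Also worth noting: the new URL pattern uses lowercase character classes only, so mixed-case URLs will be missed unless the text is lowercased first; nothing in this diff shows a re.IGNORECASE flag being applied.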
@@ -75,33 +76,21 @@ def get_nltk_tokenizer(language: str) -> Tokenizer:
     return Tokenizer(language)
 
 
-class NERObject(object):
-    def __init__(self, name, label, score, context, count, comentions):
-        self.name: str = name
-        self.label: str = label
-        self.score: Optional[float] = score
-        self.context: Optional[str] = context
-        self.count: int = count
-        self.comentions: Optional[List[str]] = comentions
-        self.sources: Optional[List[str]] = list()
+class NERObject(BaseModel):
+    name: str
+    label: str
+    score: float = 0.0
+    start: int
+    count: int
+    context: str | None = None
+    comentions: list[str] = Field(default_factory=list)
+    model_config = ConfigDict(extra="allow")
 
-    def to_dict(self):
-        data = {
-            "name": self.name,
-            "label": self.label,
-            "score": self.score,
-            "context": self.context,
-            "count": self.count,
-            "comentions": self.comentions or [],
-        }
-        if self.sources:
-            data["sources"] = self.sources
-
     def __repr__(self):
         return f"NERObject(label={self.label},name={self.name})"
 
 
-def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
+def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
     if whitelisted_labels is not None:
         entities = [e for e in entities if e.label in whitelisted_labels]
     entities = sorted(entities, key=lambda x: x.name)
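NERObject is now a pydantic model with a required start offset and a default score. A minimal sketch of how it behaves, assuming pydantic v2 (ConfigDict and model_dump are v2 APIs; the values are illustrative). extra="allow" is what lets undeclared fields, such as the old sources attribute, still ride along:

    entity = NERObject(name="Octostar", label="ORG", start=0, count=1,
                       sources=["doc-1"])  # undeclared field, kept via extra="allow"
    print(entity.score)                    # 0.0 (default)
    print(entity.model_dump()["sources"])  # ['doc-1']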
@@ -109,29 +98,48 @@ def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
     for _, group in itertools.groupby(entities, key=lambda x: x.name):
         group = list(group)
         best_entity = max(group, key=lambda x: x.score * x.count)
-        best_entity = NERObject(
-            best_entity.name,
-            best_entity.label,
-            best_entity.score,
-            best_entity.context,
-            sum([0] + [e.count for e in group]),
-            list(set(itertools.chain(*[e.comentions for e in group]))),
-        )
-        best_entity.sources = list(set(itertools.chain(*[e.sources for e in group])))
-        final_entities.append(best_entity)
+        merged_data = {
+            "name": best_entity.name,
+            "label": best_entity.label,
+            "score": best_entity.score,
+            "context": best_entity.context,
+            "count": sum(e.count for e in group),
+            "start": best_entity.start,
+        }
+        all_fields = best_entity.model_fields.keys()
+        for field in all_fields:
+            if field in merged_data:
+                continue
+            values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
+            if not values:
+                continue
+            if isinstance(values[0], list):
+                merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
+            else:
+                merged_data[field] = getattr(best_entity, field, None)
+        final_entities.append(NERObject(**merged_data))
     final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
     if max_entities and len(final_entities) > max_entities:
         final_entities = final_entities[:max_entities]
     return final_entities
 
 
-def compute_ner(language, sentences, spacy_model, flair_model=None, context_width=150):
+def compute_ner(
+    language,
+    sentences,
+    spacy_model,
+    flair_model=None,
+    context_width=150,
+    with_scores=True,
+    with_comentions=True,
+    with_context=True,
+):
     sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
     del sentence_starts[-1]
     sentence_starts = list(np.cumsum(sentence_starts))
     text = "\n".join([s[0] for s in sentences])
     min_score = 1.0
-    entities = []
+    entities: list[NERObject] = []
 
     # FLAIR model (if not fast)
     if flair_model:
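The merge step above now works field by field: entities sharing a name are grouped, counts are summed, declared list fields are unioned, and everything else is taken from the group member that maximizes score * count. Because model_fields only covers declared fields, extras admitted by extra="allow" are not merged. An illustrative run with made-up values:

    a = NERObject(name="Paris", label="GPE", score=0.9, start=10, count=1,
                  comentions=["Louvre"])
    b = NERObject(name="Paris", label="GPE", score=0.4, start=80, count=2,
                  comentions=["Seine"])
    merged = postprocess_ner([a, b])
    print(merged[0].count)               # 3 (summed)
    print(sorted(merged[0].comentions))  # ['Louvre', 'Seine'] (unioned)
    print(merged[0].score)               # 0.9 (a wins: 0.9 * 1 > 0.4 * 2)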
@@ -139,88 +147,115 @@ def compute_ner(language, sentences, spacy_model, flair_model=None, context_widt
         flair_model.predict(input)
         output = [e for sentence in input for e in sentence.get_spans("ner")]
         flair_entities = [
-            (
-                entity.text,
-                BASE_TO_ONTONOTES_LABELMAP.get(
+            NERObject(
+                name=entity.text,
+                label=BASE_TO_ONTONOTES_LABELMAP.get(
                     entity.annotation_layers["ner"][0].value,
                     entity.annotation_layers["ner"][0].value,
                 ),
-                entity.score,
-                sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                score=entity.score,
+                start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                count=1,
             )
             for entity in output
         ]
-        min_score = min(min_score, *[e[2] for e in flair_entities])
+        min_score = min([min_score] + [e.score for e in flair_entities])
         entities += flair_entities
         del flair_entities
 
+    print("Checking REGEXES")
     # REGEX model
     for label, regexes in REGEX_NER_MODELS.items():
         if not isinstance(regexes, list):
             regexes = [regexes]
         for regex in regexes:
-            print(regex)
             regex_entities = [
-                (match.group(), label, min_score - 0.5, match.start()) for match in re.finditer(regex, text)
+                NERObject(
+                    name=match.group(),
+                    label=label,
+                    score=min_score - 0.5,
+                    count=1,
+                    start=match.start(),
+                )
+                for match in re.finditer(regex, text)
             ]
-            print(regex_entities)
             entities += regex_entities
-
+            min_score = min([min_score] + [e.score for e in regex_entities])
+
     # SPACY model
-    spacy_entities = [
-        (
-            entity.text,
-            BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-            min_score - 1,
-            entity.start_char,
-        )
-        for entity in spacy_model(text).ents
-    ]
-    entities += spacy_entities
-    del spacy_entities
+    print("CHECKING SPACY")
+    chunks = []
+    chunk_start_offsets = []
+    current_chunk = []
+    current_length = 0
+    offset = 0
+    for sentence, _ in sentences:
+        sentence_len = len(sentence) + 1
+        if sentence_len > spacy_model.max_length:
+            truncated = sentence[: spacy_model.max_length - 1]
+            chunks.append(truncated)
+            chunk_start_offsets.append(offset)
+            offset += sentence_len
+            continue
+        if current_length + sentence_len > spacy_model.max_length:
+            chunks.append("\n".join(current_chunk))
+            chunk_start_offsets.append(offset - current_length)
+            current_chunk = []
+            current_length = 0
+        current_chunk.append(sentence)
+        current_length += sentence_len
+        offset += sentence_len
+    if current_chunk:
+        chunks.append("\n".join(current_chunk))
+        chunk_start_offsets.append(offset - current_length)
+    for i, chunk in enumerate(chunks):
+        doc = spacy_model(chunk)
+        chunk_offset = chunk_start_offsets[i]
+        for entity in doc.ents:
+            entities.append(
+                NERObject(
+                    name=entity.text,
+                    label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
+                    score=min_score - 0.5,
+                    start=chunk_offset + entity.start_char,
+                    count=1,
+                )
+            )
 
     # Reformatting for consistency
-    if entities:
-        min_entity_score = min([e[2] for e in entities])
-        max_entity_score = max([min_entity_score] + [e[2] for e in entities])
+    if not entities:
+        return []
+    if with_scores:
+        min_entity_score = min([e.score for e in entities])
+        max_entity_score = max([e.score for e in entities])
         entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-        entities = [(e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3]) for e in entities]
-        scores = list(np.searchsorted(sentence_starts, [e[3] + 1 for e in entities]))
+        for e in entities:
+            e.score = (e.score - min_entity_score) / entity_score_range
+        scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
         scores = [sentences[i - 1][1] for i in scores]
-        scores = [scores[i] + int(10 * entities[i][2]) for i in range(len(entities))]
+        scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
         for i in range(len(entities)):
-            entities[i] = (entities[i][0], entities[i][1], scores[i], entities[i][3])
+            entities[i].score = scores[i]
+    else:
+        for i in range(len(entities)):
+            entities[i].score = 0.0
+    if with_comentions:
         for i in range(len(entities)):
             entity = entities[i]
-            count = 1
             comentions = [
-                entities[j][0]
+                entities[j].name
                 for j in range(len(entities))
-                if j != i and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
+                if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
             ]
-            entities[i] = (
-                entity[0],
-                entity[1],
-                entity[2],
-                entity[3],
-                count,
-                comentions,
-            )
+            entities[i].comentions = comentions
+    if with_context:
         for i in range(len(entities)):
             entity = entities[i]
-            if entity[3] >= 0 and entity[3] < len(text):
-                left = max(0, entity[3] - math.floor(context_width / 2))
-                right = min(len(text), entity[3] + math.ceil(context_width / 2))
+            if entity.start >= 0 and entity.start < len(text):
+                left = max(0, entity.start - math.floor(context_width / 2))
+                right = min(len(text), entity.start + math.ceil(context_width / 2))
                 context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-            entities[i] = (
-                entity[0],
-                entity[1],
-                entity[2],
-                context,
-                entity[4],
-                entity[5],
-            )
-        entities = [NERObject(*entities[i]) for i in range(len(entities))]
+            entities[i].context = context
     return entities
 
 
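The spaCy pass no longer feeds the whole text at once: sentences are greedily packed into newline-joined chunks that respect spacy_model.max_length, a single oversized sentence is truncated rather than dropped, and chunk_start_offsets records where each chunk begins so that entity.start stays global. A standalone re-implementation of just the packing rule, with a tiny max_length to make the behavior visible (the function name and inputs are ours, not the package's):

    def pack_sentences(sentences, max_length):
        chunks, offsets = [], []
        current, current_len, offset = [], 0, 0
        for s in sentences:
            n = len(s) + 1  # +1 for the newline used when joining
            if n > max_length:
                chunks.append(s[: max_length - 1])  # oversized: truncate, don't drop
                offsets.append(offset)
                offset += n
                continue
            if current_len + n > max_length:  # flush the current chunk
                chunks.append("\n".join(current))
                offsets.append(offset - current_len)
                current, current_len = [], 0
            current.append(s)
            current_len += n
            offset += n
        if current:
            chunks.append("\n".join(current))
            offsets.append(offset - current_len)
        return chunks, offsets

    print(pack_sentences(["aaaa", "bb", "cccccc"], max_length=8))
    # (['aaaa\nbb', 'cccccc'], [0, 8]): 'cccccc' starts at offset 8 of the joined text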
@@ -264,33 +299,36 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
     return summary
 
 
-def ner_pipe(text, language, spacy_model, flair_model=None, fast=False, compression_ratio="auto"):
+def ner_pipe(
+    text,
+    language,
+    spacy_model,
+    flair_model=None,
+    fast=False,
+    compression_ratio="auto",
+    with_scores=True,
+    with_comentions=True,
+    with_context=True,
+):
     if compression_ratio == "auto":
         compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
     sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
-    ner = compute_ner(language, sentences, spacy_model, flair_model)
+    ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
     return ner
 
 
-def get_ner_handler(language, fast=False, compression_ratio="auto"):
+def get_ner_handler(language, fast=False):
     try:
         get_nltk_tokenizer(language) # raises a LookupError if the language is not valid
     except LookupError:
         language = "english"
-    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS['english'])()
-    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS['english'])()
-    return lambda text: ner_pipe(text, language, spacy_model, flair_model, fast, compression_ratio)
+    spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
+    flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
+    return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
+        text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
+    )
 
 
 @st.cache_resource
 def get_cached_ner_handler(language, fast):
     return get_ner_handler(language, fast)
-
-
-def test():
-    text = """My name is Valerio Simoni, and I live in NYC. I love the Colosseum, and my phone is +123 456 789.
-my email is aaa@ggg.com, but my ip address is secret! 123.123.123.0:1111"""
-    entities = get_ner_handler("english", True)(text)
-    print(entities)
-
-test()
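With this refactor, compression_ratio moves from get_ner_handler to the returned callable (alongside the new output toggles), so a handler cached by get_cached_ner_handler no longer pins a particular ratio. Hypothetical usage, assuming the spaCy/NLTK resources are installed (the sample text is ours):

    handler = get_ner_handler("english", fast=True)  # fast=True skips the FLAIR model
    entities = handler(
        "Contact jane@example.com from 10.0.0.1 about https://example.org.",
        compression_ratio=1.0,
        with_scores=True,
        with_comentions=False,
        with_context=False,
    )
    # Whitelist filtering: "URL" is not in BASE_ALLOWED_LABELS, so URL matches
    # are dropped here even though the new regex finds them.
    print(postprocess_ner(entities, whitelisted_labels=BASE_ALLOWED_LABELS))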

--- a/streamlit_octostar_utils-0.1.7a4.dist-info/METADATA
+++ b/streamlit_octostar_utils-0.1.7a6.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: streamlit-octostar-utils
-Version: 0.1.7a4
+Version: 0.1.7a6
 Summary:
 License: MIT
 Author: Octostar

--- a/streamlit_octostar_utils-0.1.7a4.dist-info/RECORD
+++ b/streamlit_octostar_utils-0.1.7a6.dist-info/RECORD
@@ -22,7 +22,7 @@ streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_
 streamlit_octostar_utils/hello.py,sha256=JVeug8fnyYYf_qw6eeMDBrdqsSkiwnSHdPvb9puEGdA,69
 streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
 streamlit_octostar_utils/nlp/language.py,sha256=13f6kAALYjC3EclBzcyKPd4GlIeIR1mWPu3S6d-z814,549
-streamlit_octostar_utils/nlp/ner.py,sha256=mMCbUydVDkjl2seB5dlp5NXSqi3-hemurk1OexNJZzE,11611
+streamlit_octostar_utils/nlp/ner.py,sha256=GSbo8_orY3a1o11NaIRK_P61hFG25hzV-QXJ8uHwQl4,13385
 streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 streamlit_octostar_utils/octostar/client.py,sha256=29tA1LY9ndLzAMR_KlL7Jb071YNAanubsppzy2BXb1E,1779
 streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -37,7 +37,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
 streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
 streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
 streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
-streamlit_octostar_utils-0.1.7a4.dist-info/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
-streamlit_octostar_utils-0.1.7a4.dist-info/METADATA,sha256=qyNzLU0FuRUB0VI7RY6F3xXPH5in1Iv_pNr9kZobyho,2267
-streamlit_octostar_utils-0.1.7a4.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
-streamlit_octostar_utils-0.1.7a4.dist-info/RECORD,,
+streamlit_octostar_utils-0.1.7a6.dist-info/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+streamlit_octostar_utils-0.1.7a6.dist-info/METADATA,sha256=PCy7knOjDx-BtE4xFsIDUKDzqNtgGgMSRS7JtznXSN8,2267
+streamlit_octostar_utils-0.1.7a6.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+streamlit_octostar_utils-0.1.7a6.dist-info/RECORD,,