streamlit-octostar-utils 0.1.7a5__py3-none-any.whl → 0.1.7a7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
streamlit_octostar_utils/nlp/ner.py

@@ -14,6 +14,7 @@ import numpy as np
  import math
  import nltk
  from typing import Optional, List
+ from pydantic import BaseModel, ConfigDict, Field

  SPACY_NER_MODELS = {
      "english": lambda: load_spacy(
@@ -24,15 +25,15 @@ SPACY_NER_MODELS = {
  FLAIR_NER_MODELS = {"english": lambda: SequenceTagger.load("flair/ner-english")}
  REGEX_NER_MODELS = {
      "IP_ADDRESS": [
-         r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?(?:(?=\s)|(?=\b))",
-         r"(?:(?<=:=)|(?<=\s)|(?<=\b))(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}(?::\d{1,5})?(?:(?=\s)|(?=\b))"
+         r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b",
      ],
-     "PHONE": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[+]?[(]?[0-9]{1,4}[)]?[-\s\/0-9]*(?:(?=\s)|(?=\b))",
-     "EMAIL": r"(?:(?<=:=)|(?<=\s)|(?<=\b))[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}(?:(?=\s)|(?=\b))",
+     "PHONE": r"(?:(?:\+(?:\d{1,3}[ .-]?)?(?:\(\d{1,3}\)[ .-]?)?)(?:\d{2,5}[ .-]?){1,3}|\d{2,5}[ .-]\d{2,5}(?:[ .-]\d{2,5}){0,2})\b",
+     "EMAIL": r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b",
+     "URL": r"\b(?:(?:https?|ftp|sftp|ftps|ssh|file|mailto|git|onion|ipfs|ipns):\/\/|www\.)(?:[a-z0-9](?:[a-z0-9-]{0,61}[a-z0-9])?\.)+[a-z]{2,}(?::\d+)?(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?|(?:https?:\/\/)?[a-z2-7]{16,56}\.onion(?:\/(?:[-a-z0-9\/_.,~%+:@]|(?:%[0-9a-f]{2}))*)?(?:\?(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)?(?:#(?:[-a-z0-9\/_.,~%+:@=&]|(?:%[0-9a-f]{2}))*)\b",
  }

  BASE_TO_ONTONOTES_LABELMAP = {"PER": "PERSON"}
- BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "TIME", "PHONE", "IP_ADDRESS", "EMAIL"]
+ BASE_ALLOWED_LABELS = ["PERSON", "ORG", "LOC", "NORP", "GPE", "PRODUCT", "DATE", "PHONE", "IP_ADDRESS", "EMAIL", "URL"]


  def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
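The rewritten patterns drop the lookbehind/lookahead scaffolding in favor of plain `\b` anchors; note that `IP_ADDRESS` now carries only a strict IPv4 pattern (the old IPv6 alternative is gone) and a `URL` pattern joins the set. A quick sanity check on the new patterns, using only literals copied from this hunk against a made-up sample:

```python
import re

# EMAIL and IPv4 patterns exactly as added in this hunk.
EMAIL = r"\b[a-zA-Z0-9._%+-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]*[a-zA-Z0-9])?)+\b"
IPV4 = r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::(?:[0-9]|[1-9][0-9]{1,3}|[1-5][0-9]{4}|6[0-4][0-9]{3}|65[0-4][0-9]{2}|655[0-2][0-9]|6553[0-5]))?\b"

sample = "Reach ops@example.com from 192.168.0.1:8080 (999.1.1.1 should not match)."
print(re.findall(EMAIL, sample))  # ['ops@example.com']
print(re.findall(IPV4, sample))   # ['192.168.0.1:8080'] -- octets and port are range-checked
```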
@@ -75,34 +76,21 @@ def get_nltk_tokenizer(language: str) -> Tokenizer:
  return Tokenizer(language)


- class NERObject(object):
-     def __init__(self, name, label, score, context, count, comentions):
-         self.name: str = name
-         self.label: str = label
-         self.score: Optional[float] = score
-         self.context: Optional[str] = context
-         self.count: int = count
-         self.comentions: Optional[List[str]] = comentions
-         self.sources: Optional[List[str]] = list()
+ class NERObject(BaseModel):
+     name: str
+     label: str
+     score: float = 0.0
+     start: int
+     count: int
+     context: str | None = None
+     comentions: list[str] = Field(default_factory=list)
+     model_config = ConfigDict(extra="allow")

-     def to_dict(self):
-         data = {
-             "name": self.name,
-             "label": self.label,
-             "score": self.score,
-             "context": self.context,
-             "count": self.count,
-             "comentions": self.comentions or [],
-         }
-         if self.sources:
-             data["sources"] = self.sources
-         return data
-
      def __repr__(self):
          return f"NERObject(label={self.label},name={self.name})"


- def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
+ def postprocess_ner(entities: list[NERObject], whitelisted_labels=None, max_entities=None):
      if whitelisted_labels is not None:
          entities = [e for e in entities if e.label in whitelisted_labels]
      entities = sorted(entities, key=lambda x: x.name)
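The pydantic rewrite also retires the hand-rolled `to_dict`: serialization now comes from `model_dump()`, and `extra="allow"` keeps ad-hoc attributes such as the old `sources` list alive without declaring them. A minimal sketch of the new model's behavior (assuming pydantic v2, which `ConfigDict`/`model_dump` imply):

```python
from pydantic import BaseModel, ConfigDict, Field

class NERObject(BaseModel):  # same fields as the model added above
    name: str
    label: str
    score: float = 0.0
    start: int
    count: int
    context: str | None = None
    comentions: list[str] = Field(default_factory=list)
    model_config = ConfigDict(extra="allow")

e = NERObject(name="Ada Lovelace", label="PERSON", start=42, count=1, sources=["page1"])
print(e.sources)       # extra attribute is preserved -> ['page1']
print(e.model_dump())  # includes the extra key; supersedes the old to_dict()
```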
@@ -110,29 +98,48 @@ def postprocess_ner(entities, whitelisted_labels=None, max_entities=None):
      for _, group in itertools.groupby(entities, key=lambda x: x.name):
          group = list(group)
          best_entity = max(group, key=lambda x: x.score * x.count)
-         best_entity = NERObject(
-             best_entity.name,
-             best_entity.label,
-             best_entity.score,
-             best_entity.context,
-             sum([0] + [e.count for e in group]),
-             list(set(itertools.chain(*[e.comentions for e in group]))),
-         )
-         best_entity.sources = list(set(itertools.chain(*[e.sources for e in group])))
-         final_entities.append(best_entity)
+         merged_data = {
+             "name": best_entity.name,
+             "label": best_entity.label,
+             "score": best_entity.score,
+             "context": best_entity.context,
+             "count": sum(e.count for e in group),
+             "start": best_entity.start,
+         }
+         all_fields = best_entity.model_fields.keys()
+         for field in all_fields:
+             if field in merged_data:
+                 continue
+             values = [getattr(e, field, None) for e in group if getattr(e, field, None) is not None]
+             if not values:
+                 continue
+             if isinstance(values[0], list):
+                 merged_data[field] = list(set(itertools.chain.from_iterable(values or [])))
+             else:
+                 merged_data[field] = getattr(best_entity, field, None)
+         final_entities.append(NERObject(**merged_data))
      final_entities = sorted(final_entities, key=lambda x: x.score * x.count, reverse=True)
      if max_entities and len(final_entities) > max_entities:
          final_entities = final_entities[:max_entities]
      return final_entities


- def compute_ner(language, sentences, spacy_model, flair_model=None, context_width=150):
+ def compute_ner(
+     language,
+     sentences,
+     spacy_model,
+     flair_model=None,
+     context_width=150,
+     with_scores=True,
+     with_comentions=True,
+     with_context=True,
+ ):
      sentence_starts = [0] + [len(s[0]) + 1 for s in sentences]
      del sentence_starts[-1]
      sentence_starts = list(np.cumsum(sentence_starts))
      text = "\n".join([s[0] for s in sentences])
      min_score = 1.0
-     entities = []
+     entities: list[NERObject] = []

      # FLAIR model (if not fast)
      if flair_model:
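The new merge loop generalizes the old hard-coded field handling: duplicate names collapse into one entity whose `count` is summed, whose declared list fields are unioned, and whose scalar fields come from the group member with the highest `score * count`. For instance (a sketch reusing `NERObject` and `postprocess_ner` from this diff):

```python
a = NERObject(name="ACME", label="ORG", score=0.9, start=10, count=2, comentions=["Alice"])
b = NERObject(name="ACME", label="ORG", score=0.4, start=80, count=1, comentions=["Bob"])

merged = postprocess_ner([a, b], whitelisted_labels=["ORG"])
print(merged[0].count)               # 3 (counts summed across the group)
print(sorted(merged[0].comentions))  # ['Alice', 'Bob'] (list fields unioned)
print(merged[0].score)               # 0.9 (taken from the best group member)
```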
@@ -140,88 +147,115 @@ def compute_ner(language, sentences, spacy_model, flair_model=None, context_widt
          flair_model.predict(input)
          output = [e for sentence in input for e in sentence.get_spans("ner")]
          flair_entities = [
-             (
-                 entity.text,
-                 BASE_TO_ONTONOTES_LABELMAP.get(
+             NERObject(
+                 name=entity.text,
+                 label=BASE_TO_ONTONOTES_LABELMAP.get(
                      entity.annotation_layers["ner"][0].value,
                      entity.annotation_layers["ner"][0].value,
                  ),
-                 entity.score,
-                 sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                 score=entity.score,
+                 start=sentence_starts[input.index(entity[0].sentence)] + entity[0].start_position,
+                 count=1,
              )
              for entity in output
          ]
-         min_score = min(min_score, *[e[2] for e in flair_entities])
+         min_score = min([min_score] + [e.score for e in flair_entities])
          entities += flair_entities
          del flair_entities

+     print("Checking REGEXES")
      # REGEX model
      for label, regexes in REGEX_NER_MODELS.items():
          if not isinstance(regexes, list):
              regexes = [regexes]
          for regex in regexes:
-             print(regex)
              regex_entities = [
-                 (match.group(), label, min_score - 0.5, match.start()) for match in re.finditer(regex, text)
+                 NERObject(
+                     name=match.group(),
+                     label=label,
+                     score=min_score - 0.5,
+                     count=1,
+                     start=match.start(),
+                 )
+                 for match in re.finditer(regex, text)
              ]
-             print(regex_entities)
              entities += regex_entities
-
+             min_score = min([min_score] + [e.score for e in regex_entities])
+
      # SPACY model
-     spacy_entities = [
-         (
-             entity.text,
-             BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
-             min_score - 1,
-             entity.start_char,
-         )
-         for entity in spacy_model(text).ents
-     ]
-     entities += spacy_entities
-     del spacy_entities
+     print("CHECKING SPACY")
+     chunks = []
+     chunk_start_offsets = []
+     current_chunk = []
+     current_length = 0
+     offset = 0
+     for sentence, _ in sentences:
+         sentence_len = len(sentence) + 1
+         if sentence_len > spacy_model.max_length:
+             truncated = sentence[: spacy_model.max_length - 1]
+             chunks.append(truncated)
+             chunk_start_offsets.append(offset)
+             offset += sentence_len
+             continue
+         if current_length + sentence_len > spacy_model.max_length:
+             chunks.append("\n".join(current_chunk))
+             chunk_start_offsets.append(offset - current_length)
+             current_chunk = []
+             current_length = 0
+         current_chunk.append(sentence)
+         current_length += sentence_len
+         offset += sentence_len
+     if current_chunk:
+         chunks.append("\n".join(current_chunk))
+         chunk_start_offsets.append(offset - current_length)
+     for i, chunk in enumerate(chunks):
+         doc = spacy_model(chunk)
+         chunk_offset = chunk_start_offsets[i]
+         for entity in doc.ents:
+             entities.append(
+                 NERObject(
+                     name=entity.text,
+                     label=BASE_TO_ONTONOTES_LABELMAP.get(entity.label_, entity.label_),
+                     score=min_score - 0.5,
+                     start=chunk_offset + entity.start_char,
+                     count=1,
+                 )
+             )

      # Reformatting for consistency
-     if entities:
-         min_entity_score = min([e[2] for e in entities])
-         max_entity_score = max([min_entity_score] + [e[2] for e in entities])
+     if not entities:
+         return []
+     if with_scores:
+         min_entity_score = min([e.score for e in entities])
+         max_entity_score = max([e.score for e in entities])
          entity_score_range = 1 if min_entity_score == max_entity_score else (max_entity_score - min_entity_score)
-         entities = [(e[0], e[1], (e[2] - min_entity_score) / entity_score_range, e[3]) for e in entities]
-         scores = list(np.searchsorted(sentence_starts, [e[3] + 1 for e in entities]))
+         for e in entities:
+             e.score = (e.score - min_entity_score) / entity_score_range
+         scores = list(np.searchsorted(sentence_starts, [e.start + 1 for e in entities]))
          scores = [sentences[i - 1][1] for i in scores]
-         scores = [scores[i] + int(10 * entities[i][2]) for i in range(len(entities))]
+         scores = [scores[i] + 10 * entities[i].score for i in range(len(entities))]
+         for i in range(len(entities)):
+             entities[i].score = scores[i]
+     else:
          for i in range(len(entities)):
-             entities[i] = (entities[i][0], entities[i][1], scores[i], entities[i][3])
+             entities[i].score = 0.0
+     if with_comentions:
          for i in range(len(entities)):
              entity = entities[i]
-             count = 1
              comentions = [
-                 entities[j][0]
+                 entities[j].name
                  for j in range(len(entities))
-                 if j != i and abs(entities[j][3] - entity[3]) < math.ceil(context_width / 2)
+                 if j != i and abs(entities[j].start - entity.start) < math.ceil(context_width / 2)
              ]
-             entities[i] = (
-                 entity[0],
-                 entity[1],
-                 entity[2],
-                 entity[3],
-                 count,
-                 comentions,
-             )
+             entities[i].comentions = comentions
+     if with_context:
          for i in range(len(entities)):
              entity = entities[i]
-             if entity[3] >= 0 and entity[3] < len(text):
-                 left = max(0, entity[3] - math.floor(context_width / 2))
-                 right = min(len(text), entity[3] + math.ceil(context_width / 2))
+             if entity.start >= 0 and entity.start < len(text):
+                 left = max(0, entity.start - math.floor(context_width / 2))
+                 right = min(len(text), entity.start + math.ceil(context_width / 2))
                  context = ("[..]" if left > 0 else "") + text[left:right] + ("[..]" if right < len(text) else "")
-                 entities[i] = (
-                     entity[0],
-                     entity[1],
-                     entity[2],
-                     context,
-                     entity[4],
-                     entity[5],
-                 )
-     entities = [NERObject(*entities[i]) for i in range(len(entities))]
+                 entities[i].context = context
      return entities


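The spaCy path no longer feeds the whole text through one `nlp()` call: sentences are packed into chunks that respect `spacy_model.max_length`, and each chunk records its character offset so `entity.start_char` can be shifted back into full-text coordinates. The packing logic in isolation (a toy `max_length`, no spaCy needed):

```python
# Pack sentences into chunks under max_length, remembering each chunk's
# start offset in the "\n"-joined full text.
sentences = ["First sentence.", "Second sentence.", "Third."]
max_length = 20  # stand-in for spacy_model.max_length

chunks, offsets, cur, cur_len, offset = [], [], [], 0, 0
for s in sentences:
    n = len(s) + 1  # +1 for the joining "\n"
    if cur_len + n > max_length and cur:
        chunks.append("\n".join(cur))
        offsets.append(offset - cur_len)
        cur, cur_len = [], 0
    cur.append(s)
    cur_len += n
    offset += n
if cur:
    chunks.append("\n".join(cur))
    offsets.append(offset - cur_len)

print(list(zip(offsets, chunks)))
# [(0, 'First sentence.'), (16, 'Second sentence.'), (33, 'Third.')]
```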
@@ -265,24 +299,36 @@ def get_extractive_summary(text, language, max_chars, fast=False, with_scores=Fa
      return summary


- def ner_pipe(text, language, spacy_model, flair_model=None, fast=False, compression_ratio="auto"):
+ def ner_pipe(
+     text,
+     language,
+     spacy_model,
+     flair_model=None,
+     fast=False,
+     compression_ratio="auto",
+     with_scores=True,
+     with_comentions=True,
+     with_context=True,
+ ):
      if compression_ratio == "auto":
          compression_ratio = max(1.0, len(text) / 15000) if fast else 1.0
      sentences = get_extractive_summary(text, language, int(len(text) / compression_ratio), fast=fast, with_scores=True)
-     ner = compute_ner(language, sentences, spacy_model, flair_model)
+     ner = compute_ner(language, sentences, spacy_model, flair_model, 150, with_scores, with_comentions, with_context)
      return ner


- def get_ner_handler(language, fast=False, compression_ratio="auto"):
+ def get_ner_handler(language, fast=False):
      try:
          get_nltk_tokenizer(language)  # raises a LookupError if the language is not valid
      except LookupError:
          language = "english"
-     spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS['english'])()
-     flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS['english'])()
-     return lambda text: ner_pipe(text, language, spacy_model, flair_model, fast, compression_ratio)
+     spacy_model = SPACY_NER_MODELS.get(language, SPACY_NER_MODELS["english"])()
+     flair_model = None if fast else FLAIR_NER_MODELS.get(language, FLAIR_NER_MODELS["english"])()
+     return lambda text, compression_ratio="auto", with_scores=True, with_comentions=True, with_context=True: ner_pipe(
+         text, language, spacy_model, flair_model, fast, compression_ratio, with_scores, with_comentions, with_context
+     )


  @st.cache_resource
  def get_cached_ner_handler(language, fast):
-     return get_ner_handler(language, fast)
+     return get_ner_handler(language, fast)
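`compression_ratio` and the new `with_*` switches moved off `get_ner_handler` and onto the returned callable, so a cached handler can still serve differently configured calls. Hypothetical usage (spaCy/Flair model downloads assumed available; the sample text is made up):

```python
handler = get_ner_handler("english", fast=True)

# Per-call flags now live on the returned callable, not on the factory.
entities = handler("Mail john@example.com from 10.0.0.1", with_context=False)
for e in postprocess_ner(entities, whitelisted_labels=BASE_ALLOWED_LABELS, max_entities=10):
    print(e.label, e.name, e.score)
```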
METADATA

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: streamlit-octostar-utils
- Version: 0.1.7a5
+ Version: 0.1.7a7
  Summary:
  License: MIT
  Author: Octostar
RECORD

@@ -22,7 +22,7 @@ streamlit_octostar_utils/core/timestamp.py,sha256=a3s4xfm1nctLzYsHOJxqoWIDTdbNY_
  streamlit_octostar_utils/hello.py,sha256=JVeug8fnyYYf_qw6eeMDBrdqsSkiwnSHdPvb9puEGdA,69
  streamlit_octostar_utils/nlp/__init__.py,sha256=BtlYDZK_xaEbc7Ju_7MznXbCVPZcdLn26xwR9qf_UhM,336
  streamlit_octostar_utils/nlp/language.py,sha256=13f6kAALYjC3EclBzcyKPd4GlIeIR1mWPu3S6d-z814,549
- streamlit_octostar_utils/nlp/ner.py,sha256=N9Y0d-DU55kb3WY0z3f44IX_eIXhD_jVH8tnuStffyY,11366
+ streamlit_octostar_utils/nlp/ner.py,sha256=lw4VGJQHCJAudQvBJszJ0MYG-JGL-5-Y25wLY5ky6bU,13384
  streamlit_octostar_utils/octostar/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
  streamlit_octostar_utils/octostar/client.py,sha256=29tA1LY9ndLzAMR_KlL7Jb071YNAanubsppzy2BXb1E,1779
  streamlit_octostar_utils/octostar/context.py,sha256=TpucK48EbeVy4vDqKd9UULEtr1JOY-_4nBs-rXZzESw,212
@@ -37,7 +37,7 @@ streamlit_octostar_utils/threading/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzp
  streamlit_octostar_utils/threading/async_task_manager.py,sha256=q7N6YZwUvIYMzkSHmsJNheNVCv93c03H6Hyg9uH8pvk,4747
  streamlit_octostar_utils/threading/session_callback_manager.py,sha256=LvZVP4g6tvKtYmI13f2j1sX_7hm61Groqp5xJine9_k,3973
  streamlit_octostar_utils/threading/session_state_hot_swapper.py,sha256=6eeCQI6A42hp4DmW2NQw2rbeR-k9N8DhfBKQdN_fbLU,811
- streamlit_octostar_utils-0.1.7a5.dist-info/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
- streamlit_octostar_utils-0.1.7a5.dist-info/METADATA,sha256=rhcd3gLwh_hMQ0UBj2Z7ggHpPuhyvfey6wxKe8OUvU4,2267
- streamlit_octostar_utils-0.1.7a5.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
- streamlit_octostar_utils-0.1.7a5.dist-info/RECORD,,
+ streamlit_octostar_utils-0.1.7a7.dist-info/LICENSE,sha256=dkwVPyV03fPHHtERnF6RnvRXcll__tud9gWca1RcgnQ,1073
+ streamlit_octostar_utils-0.1.7a7.dist-info/METADATA,sha256=gziDnnF2kHGt2d5MucoV4I8ELE3tZlDQQWF_ggTGwk0,2267
+ streamlit_octostar_utils-0.1.7a7.dist-info/WHEEL,sha256=fGIA9gx4Qxk2KDKeNJCbOEwSrmLtjWCwzBz351GyrPQ,88
+ streamlit_octostar_utils-0.1.7a7.dist-info/RECORD,,