streamlit-octostar-utils 0.5.0.dev6__tar.gz → 0.5.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/nifi.py +47 -19
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/ner.py +214 -83
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/LICENSE +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/README.md +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/language.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/relationships.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
|
@@ -45,6 +45,7 @@ TAG_RELATIONSHIP = RelationshipName(name="has_tag", type="mtm")
|
|
|
45
45
|
FRAGMENT_RELATIONSHIP = RelationshipName(name="is_fragment_of", type="otm")
|
|
46
46
|
PREVIOUS_FRAGMENT_RELATIONSHIP = RelationshipName(name="has_previous_fragment", type="otm")
|
|
47
47
|
NEXT_FRAGMENT_RELATIONSHIP = RelationshipName(name="has_next_fragment", type="otm")
|
|
48
|
+
SOURCE_FRAGMENT_ENTITY_RELATIONSHIP = RelationshipName(name="is_child_fragment_of", type="otm")
|
|
48
49
|
OS_RESERVED_FIELDS = [
|
|
49
50
|
"os_entity_uid", "entity_id", "entity_type", "os_concept",
|
|
50
51
|
"entity_label", "os_created_at", "os_created_by",
|
|
@@ -212,6 +213,7 @@ class NifiFragmenter(object):
|
|
|
212
213
|
raise ValueError("Cannot have more than 100k entities for fragmentation")
|
|
213
214
|
identifier = str(uuid.uuid4())
|
|
214
215
|
root_uid = fragments[0].record["os_entity_uid"]
|
|
216
|
+
root_type = fragments[0].record["os_concept"]
|
|
215
217
|
for i, entity in enumerate(fragments):
|
|
216
218
|
travel_dict(entity.request["nifi_attributes"], fragmenter_keylist.split("."), "w")(
|
|
217
219
|
{"identifier": identifier, "count": count, "index": i}
|
|
@@ -225,7 +227,7 @@ class NifiFragmenter(object):
|
|
|
225
227
|
"fragments_stack"
|
|
226
228
|
]
|
|
227
229
|
travel_dict(entity.request["config"]["fragment"], fragmenter_keylist.split("."), "w")(
|
|
228
|
-
{"identifier": identifier, "count": count, "index": i, "root_uid": root_uid}
|
|
230
|
+
{"identifier": identifier, "count": count, "index": i, "root_uid": root_uid, "root_type": root_type}
|
|
229
231
|
)
|
|
230
232
|
|
|
231
233
|
def push_defragment_strategy(fragment, defragmenter_config):
|
|
@@ -424,30 +426,34 @@ class NifiFragmenter(object):
|
|
|
424
426
|
)
|
|
425
427
|
|
|
426
428
|
@staticmethod
|
|
427
|
-
def
|
|
428
|
-
"""Resolve the
|
|
429
|
+
def resolve_source_entity(entity, fragment_root_source=None) -> tuple:
    """Resolve the source entity (recursive originator) for child fragments.

    Resolution order:

    1. When ``fragment_root_source`` is given, it is converted to an int if
       possible (stack index) and otherwise used as-is (fragmenter keylist
       name); the root is then looked up via ``entity.get_fragment_root()``.
    2. When it is None and the entity's fragment config holds a non-empty
       ``fragments_stack``, the root of stack index ``-1`` is used.
    3. Otherwise the entity itself is the source.

    Args:
        entity: A NifiEntity or NifiEntityProxy.
        fragment_root_source: None for automatic resolution, an int
            to index into the fragments stack, or a string fragmenter
            keylist name.

    Returns:
        (source_entity_uid, source_entity_type) tuple.

    Raises:
        Whatever ``entity.get_fragment_root()`` raises for an unknown
        index or name; such errors are propagated unchanged.
    """
    if fragment_root_source is not None:
        # Guard only the conversion. Keeping the lookup outside the try
        # means a ValueError/TypeError raised *inside* get_fragment_root is
        # reported as-is instead of triggering a second, different lookup.
        try:
            root_key = int(fragment_root_source)
        except (ValueError, TypeError):
            root_key = fragment_root_source
        return entity.get_fragment_root(root_key)

    fragment_config = entity.request.get("config", {}).get("fragment", {})
    if fragment_config.get("fragments_stack", []):
        # Already fragmented: take the root recorded at stack index -1.
        return entity.get_fragment_root(-1)

    # Not a fragment: the entity is its own source.
    return (entity.record["os_entity_uid"], entity.record["os_concept"])
|
|
451
457
|
|
|
452
458
|
|
|
453
459
|
class NifiEntityBatch(object):
|
|
@@ -1089,7 +1095,17 @@ class NifiEntity(object):
|
|
|
1089
1095
|
)
|
|
1090
1096
|
return NifiFragmenter.get_fragment_info(self, key).get("index", 0) == 0
|
|
1091
1097
|
|
|
1092
|
-
def
|
|
1098
|
+
def get_fragment_root(self, fragment_name_or_idx) -> tuple:
|
|
1099
|
+
"""Return (root_uid, root_type) for a given fragmentation level.
|
|
1100
|
+
|
|
1101
|
+
Args:
|
|
1102
|
+
fragment_name_or_idx: An int to index into fragments_stack
|
|
1103
|
+
(e.g. -1 for the oldest fragmentation), or a string
|
|
1104
|
+
fragmenter keylist name.
|
|
1105
|
+
|
|
1106
|
+
Returns:
|
|
1107
|
+
(root_uid, root_type) tuple for the root entity at that level.
|
|
1108
|
+
"""
|
|
1093
1109
|
fragment_config = self.request.get("config", {}).get("fragment", {})
|
|
1094
1110
|
fragments_stack = fragment_config.get("fragments_stack", [])
|
|
1095
1111
|
if isinstance(fragment_name_or_idx, int):
|
|
@@ -1114,7 +1130,10 @@ class NifiEntity(object):
|
|
|
1114
1130
|
raise KeyError(
|
|
1115
1131
|
f"No root_uid found in fragment config for '{fragment_key}'"
|
|
1116
1132
|
)
|
|
1117
|
-
return
|
|
1133
|
+
return (
|
|
1134
|
+
fragment_data["root_uid"],
|
|
1135
|
+
fragment_data.get("root_type", "os_fragment"),
|
|
1136
|
+
)
|
|
1118
1137
|
|
|
1119
1138
|
def to_json(self):
|
|
1120
1139
|
if self.drop_on_output:
|
|
@@ -1408,6 +1427,7 @@ class NifiEntity(object):
|
|
|
1408
1427
|
self,
|
|
1409
1428
|
os_workspace,
|
|
1410
1429
|
source_entity_uid,
|
|
1430
|
+
source_entity_type,
|
|
1411
1431
|
filename,
|
|
1412
1432
|
filetype,
|
|
1413
1433
|
file: Union[Contents, bytes],
|
|
@@ -1422,9 +1442,12 @@ class NifiEntity(object):
|
|
|
1422
1442
|
next_fragment_uid=None,
|
|
1423
1443
|
next_fragment_relationship_uid=None,
|
|
1424
1444
|
next_fragment_relationship=NEXT_FRAGMENT_RELATIONSHIP,
|
|
1445
|
+
source_entity_relationship_uid=None,
|
|
1446
|
+
source_entity_relationship=SOURCE_FRAGMENT_ENTITY_RELATIONSHIP,
|
|
1425
1447
|
):
|
|
1426
1448
|
fields = {
|
|
1427
1449
|
**fields,
|
|
1450
|
+
"os_parent_uid": self.record["os_entity_uid"],
|
|
1428
1451
|
"source_entity_uid": source_entity_uid,
|
|
1429
1452
|
"previous_entity_uid": previous_fragment_uid,
|
|
1430
1453
|
"next_entity_uid": next_fragment_uid,
|
|
@@ -1440,9 +1463,14 @@ class NifiEntity(object):
|
|
|
1440
1463
|
os_relationship_uid,
|
|
1441
1464
|
os_entity_type,
|
|
1442
1465
|
)
|
|
1443
|
-
prev_rel, next_rel = None, None
|
|
1444
1466
|
fragment_uid = child_entity.record["os_entity_uid"]
|
|
1445
1467
|
fragment_type = child_entity.record["os_concept"]
|
|
1468
|
+
source_rel = self._add_relationship(
|
|
1469
|
+
os_workspace, fragment_uid, fragment_type,
|
|
1470
|
+
source_entity_uid, source_entity_type,
|
|
1471
|
+
source_entity_relationship, {}, source_entity_relationship_uid,
|
|
1472
|
+
)
|
|
1473
|
+
prev_rel, next_rel = None, None
|
|
1446
1474
|
if previous_fragment_uid:
|
|
1447
1475
|
prev_rel = self._add_relationship(
|
|
1448
1476
|
os_workspace, fragment_uid, fragment_type,
|
|
@@ -1455,7 +1483,7 @@ class NifiEntity(object):
|
|
|
1455
1483
|
next_fragment_uid, os_entity_type,
|
|
1456
1484
|
next_fragment_relationship, {}, next_fragment_relationship_uid,
|
|
1457
1485
|
)
|
|
1458
|
-
return child_entity, child_rel, prev_rel, next_rel
|
|
1486
|
+
return child_entity, child_rel, source_rel, prev_rel, next_rel
|
|
1459
1487
|
|
|
1460
1488
|
def add_tag(self, os_workspace, name, group, order, color, fields={}):
|
|
1461
1489
|
return self.add_child_entity(
|
|
@@ -13,18 +13,14 @@ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, AnalysisExplan
|
|
|
13
13
|
EntityRecognizer, RecognizerResult
|
|
14
14
|
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
|
|
15
15
|
from presidio_analyzer.predefined_recognizers import SpacyRecognizer, PhoneRecognizer
|
|
16
|
-
import streamlit as st
|
|
17
16
|
import nltk
|
|
18
17
|
from flair.data import Sentence
|
|
19
18
|
from flair.models import SequenceTagger
|
|
20
19
|
|
|
21
20
|
from .custom_recognizers import PhonePatternRecognizer, ModernUrlRecognizer
|
|
22
21
|
|
|
23
|
-
from sumy.parsers.plaintext import PlaintextParser
|
|
24
22
|
from sumy.nlp.tokenizers import Tokenizer
|
|
25
23
|
from sumy.nlp.stemmers import Stemmer
|
|
26
|
-
from sumy.summarizers.lsa import LsaSummarizer
|
|
27
|
-
from sumy.summarizers.luhn import LuhnSummarizer
|
|
28
24
|
from sumy.utils import get_stop_words
|
|
29
25
|
|
|
30
26
|
from .language import to_name, SPACY_MODELS
|
|
@@ -450,36 +446,151 @@ def expand_entities_for_analyzer(entities_list):
|
|
|
450
446
|
return list(expanded)
|
|
451
447
|
|
|
452
448
|
|
|
453
|
-
def
|
|
454
|
-
|
|
455
|
-
from
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
449
|
+
def _map_paragraph_sentences(lines, line_offsets, tokenizer):
    """Sentence-tokenize a joined paragraph and map sentences to source offsets.

    The paragraph lines are joined with single spaces, split into sentences
    with ``tokenizer.to_sentences``, and each sentence's start position in the
    joined string is translated back to a character offset in the original
    text using the per-line start offsets.

    Args:
        lines: Paragraph lines, in order.
        line_offsets: Original character offset of the start of each line,
            parallel to ``lines``.
        tokenizer: Object exposing ``to_sentences(str)``.

    Returns:
        List of ``(sentence_text, original_char_offset)`` tuples in the order
        the tokenizer produced the sentences. Empty when the joined paragraph
        is empty.
    """
    from bisect import bisect_right

    joined = ' '.join(lines)
    if not joined:
        return []
    sents = tokenizer.to_sentences(joined)

    # One segment per line: (start in joined, start in original, length).
    segments = []
    segment_ends = []  # exclusive end of each segment in joined coordinates
    j = 0
    for line_text, orig_start in zip(lines, line_offsets):
        segments.append((j, orig_start, len(line_text)))
        segment_ends.append(j + len(line_text))
        j += len(line_text) + 1  # +1 for the joining space

    def _to_original(pos_in_joined):
        # segment_ends is strictly increasing, so the first segment whose end
        # lies beyond the position can be found by binary search instead of
        # scanning every line for every sentence.
        i = bisect_right(segment_ends, pos_in_joined)
        if i == len(segments):
            # Past the last line: clamp to the end of the paragraph.
            last = segments[-1]
            return last[1] + last[2]
        j_start, o_start, _ = segments[i]
        return o_start + (pos_in_joined - j_start)

    results = []
    search_pos = 0
    for sent in sents:
        idx = joined.find(sent, search_pos)
        if idx == -1:
            # Tokenizer output is not a verbatim substring; fall back to the
            # current scan position so offsets stay monotonic.
            idx = search_pos
        results.append((sent, _to_original(idx)))
        search_pos = idx + len(sent)
    return results
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def _tokenize_sentences(text, tokenizer):
    """Break ``text`` into sentences, one paragraph at a time.

    Whitespace-only lines act as paragraph separators. The non-blank lines
    of each paragraph are stripped, remembered together with the offset of
    their first non-whitespace character, and handed to
    ``_map_paragraph_sentences``.

    Returns:
        List of ``(sentence_text, original_char_offset)`` pairs, paragraphs
        in document order.
    """
    collected = []
    paragraph = []  # stripped lines of the paragraph being accumulated
    starts = []     # offset in ``text`` of each of those lines
    cursor = 0      # offset in ``text`` of the line being examined

    # keepends=True so that len(raw) advances the cursor past the newline too.
    for raw in text.splitlines(True):
        content = raw.strip()
        if content:
            indent = len(raw) - len(raw.lstrip())
            starts.append(cursor + indent)
            paragraph.append(content)
        elif paragraph:
            # Blank line closes the paragraph collected so far.
            collected.extend(
                _map_paragraph_sentences(paragraph, starts, tokenizer)
            )
            paragraph, starts = [], []
        cursor += len(raw)

    # Text that does not end with a blank line leaves one paragraph pending.
    if paragraph:
        collected.extend(_map_paragraph_sentences(paragraph, starts, tokenizer))
    return collected
|
|
474
507
|
|
|
475
|
-
u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
|
|
476
|
-
ranks = iter(summarizer._compute_ranks(sigma, v))
|
|
477
|
-
return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
|
|
478
508
|
|
|
509
|
+
def _build_sentence_vocab(sentences, tokenizer, stemmer, stop_words):
    """Turn sentences into lists of vocabulary ids.

    Every word from ``tokenizer.to_words`` is lower-cased, dropped if it is in
    ``stop_words``, and otherwise stemmed with ``stemmer``. Each distinct stem
    receives the next free integer id, in order of first appearance.

    Returns:
        sentence_word_indices: one list of vocab ids per sentence (repeats
            kept, in word order)
        n_vocab: number of distinct stems seen
        doc_freq: dict mapping a vocab id to the number of sentences that
            contain it at least once
    """
    stem_ids = {}
    sentences_with_term = {}
    per_sentence_ids = []

    for sentence in sentences:
        ids = []
        for token in tokenizer.to_words(sentence):
            lowered = token.lower()
            if lowered in stop_words:
                continue
            # len(stem_ids) is the next unused id when the stem is new.
            ids.append(stem_ids.setdefault(stemmer(lowered), len(stem_ids)))

        # Count each term once per sentence, in first-seen order.
        for term_id in dict.fromkeys(ids):
            sentences_with_term[term_id] = sentences_with_term.get(term_id, 0) + 1
        per_sentence_ids.append(ids)

    return per_sentence_ids, len(stem_ids), sentences_with_term
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def _score_tfidf_centroid(sentence_word_indices, n_vocab, doc_freq):
    """Rank sentences by how closely they resemble the document as a whole.

    A sparse TF-IDF vector (dict of vocab id -> weight) is built per sentence.
    The centroid is the sum of those vectors divided by the total number of
    sentences, and a sentence's score is the cosine between its vector and
    the centroid. Sentences without any indexed word score 0.0.

    Args:
        sentence_word_indices: list of lists of vocab ids, one per sentence.
        n_vocab: vocabulary size; 0 short-circuits to all-zero scores.
        doc_freq: dict mapping vocab id to the number of sentences containing it.

    Returns:
        List of float scores, parallel to ``sentence_word_indices``.
    """
    total = len(sentence_word_indices)
    if total == 0 or n_vocab == 0:
        return [0.0] * total

    # Inverse document frequency with a +1 floor so no term weighs zero.
    idf = {}
    for term, containing in doc_freq.items():
        idf[term] = math.log(total / containing) + 1.0

    vectors = []
    centroid = {}
    for term_ids in sentence_word_indices:
        if not term_ids:
            vectors.append(None)  # placeholder keeps positions aligned
            continue

        counts = {}
        for term in term_ids:
            counts[term] = counts.get(term, 0) + 1
        peak = max(counts.values())

        # Augmented term frequency: 0.5 + 0.5 * tf / max_tf, scaled by idf.
        vector = {
            term: (0.5 + 0.5 * freq / peak) * idf.get(term, 0.0)
            for term, freq in counts.items()
        }
        for term, weight in vector.items():
            centroid[term] = centroid.get(term, 0.0) + weight
        vectors.append(vector)

    # Turn the accumulated sum into a mean and measure its length.
    scale = 1.0 / total
    length_sq = 0.0
    for term in centroid:
        centroid[term] *= scale
        length_sq += centroid[term] ** 2
    centroid_length = math.sqrt(length_sq)
    if centroid_length == 0:
        return [0.0] * total

    def _cosine(vector):
        if vector is None:
            return 0.0
        dot = sum(weight * centroid.get(term, 0.0) for term, weight in vector.items())
        length = math.sqrt(sum(w * w for w in vector.values()))
        return dot / (length * centroid_length) if length > 0 else 0.0

    return [_cosine(vector) for vector in vectors]
|
|
483
594
|
|
|
484
595
|
|
|
485
596
|
def get_nltk_tokenizer(language: str) -> Tokenizer:
|
|
@@ -911,44 +1022,72 @@ def _compute_ner_batch(
|
|
|
911
1022
|
return all_ner_objects
|
|
912
1023
|
|
|
913
1024
|
|
|
914
|
-
def get_extractive_summary(text, language, max_chars,
|
|
1025
|
+
def get_extractive_summary(text, language, max_chars, with_scores=False):
    """Extract a summary using TF-IDF centroid sentence scoring.

    Scores all sentences by cosine similarity to the TF-IDF document centroid,
    then greedily selects the highest-scoring sentences up to max_chars. The
    sentence that crosses the budget is the one that gets truncated, so the
    joined summary never exceeds max_chars; selected sentences are then
    restored to document order.

    Args:
        text: Input text to summarize.
        language: Language code (e.g. 'en', 'de', 'fr').
        max_chars: Maximum character budget for the summary.
        with_scores: If True, return list of (sentence_text, normalized_score,
            original_char_offset) 3-tuples in document order.
            If False, return a single joined summary string.

    Returns:
        A string (sentences joined by single spaces) or, with
        ``with_scores=True``, a list of 3-tuples whose scores are min-max
        normalized over the selected sentences. Empty string / empty list
        when nothing is selected.
    """
    tokenizer = get_nltk_tokenizer(language)
    stemmer = Stemmer(language)
    stop_words = frozenset(w.lower() for w in get_stop_words(language))

    sentence_pairs = _tokenize_sentences(text, tokenizer)
    if not sentence_pairs:
        return [] if with_scores else ""

    sentence_texts = [s for s, _ in sentence_pairs]
    word_indices, n_vocab, doc_freq = _build_sentence_vocab(
        sentence_texts, tokenizer, stemmer, stop_words
    )
    scores = _score_tfidf_centroid(word_indices, n_vocab, doc_freq)

    # (sentence, score, original offset, document position), best score first.
    # sorted() is stable, so equal scores keep document order.
    ranked = sorted(
        (
            (sent_text, score, offset, order)
            for order, ((sent_text, offset), score)
            in enumerate(zip(sentence_pairs, scores))
        ),
        key=lambda item: item[1],
        reverse=True,
    )

    selected = []
    total_chars = 0
    for sent_text, score, offset, order in ranked:
        remaining = max_chars - total_chars
        if remaining <= 0:
            break
        # Truncate here, while we still know which sentence overflowed the
        # budget. Doing it after the document-order sort would clip whichever
        # sentence happens to come last in the text instead.
        sent_text = sent_text[:remaining]
        selected.append((sent_text, score, offset, order))
        total_chars += len(sent_text) + 1  # +1 for the joining space

    selected.sort(key=lambda item: item[3])
    summary = [(s[0], s[1], s[2]) for s in selected]

    if not with_scores:
        return " ".join(s[0] for s in summary)

    if not summary:
        return []
    min_score = min(s[1] for s in summary)
    max_score = max(s[1] for s in summary)
    # Avoid dividing by zero when every selected sentence scored the same.
    score_range = (max_score - min_score) if max_score != min_score else 1.0
    return [(s[0], (s[1] - min_score) / score_range, s[2]) for s in summary]
|
|
952
1091
|
|
|
953
1092
|
|
|
954
1093
|
def _preprocess_newlines_for_ner(text: str) -> str:
|
|
@@ -1008,7 +1147,7 @@ def _strip_honorifics_for_ner(text: str) -> str:
|
|
|
1008
1147
|
return result
|
|
1009
1148
|
|
|
1010
1149
|
|
|
1011
|
-
def _preprocess_text_for_ner(text, language,
|
|
1150
|
+
def _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines):
|
|
1012
1151
|
"""Preprocess a single text for NER (newlines, honorifics, compression)."""
|
|
1013
1152
|
if preprocess_newlines:
|
|
1014
1153
|
text = _preprocess_newlines_for_ner(text)
|
|
@@ -1016,11 +1155,11 @@ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess
|
|
|
1016
1155
|
|
|
1017
1156
|
cr = compression_ratio
|
|
1018
1157
|
if cr == "auto":
|
|
1019
|
-
cr = max(1.0, len(text) / 15000)
|
|
1158
|
+
cr = max(1.0, len(text) / 15000)
|
|
1020
1159
|
|
|
1021
1160
|
if cr > 1.0:
|
|
1022
1161
|
sentences = get_extractive_summary(
|
|
1023
|
-
text, language, int(len(text) / cr),
|
|
1162
|
+
text, language, int(len(text) / cr), with_scores=True
|
|
1024
1163
|
)
|
|
1025
1164
|
text = " ".join([s[0] for s in sentences])
|
|
1026
1165
|
|
|
@@ -1032,7 +1171,6 @@ def _ner_pipe_batch(
|
|
|
1032
1171
|
language,
|
|
1033
1172
|
model,
|
|
1034
1173
|
engine_type="spacy",
|
|
1035
|
-
fast=False,
|
|
1036
1174
|
compression_ratio="auto",
|
|
1037
1175
|
with_comentions=True,
|
|
1038
1176
|
with_context=True,
|
|
@@ -1056,7 +1194,7 @@ def _ner_pipe_batch(
|
|
|
1056
1194
|
if not isinstance(t, str):
|
|
1057
1195
|
raise TypeError(f"Each text must be str, not {type(t).__name__}")
|
|
1058
1196
|
processed_texts.append(
|
|
1059
|
-
_preprocess_text_for_ner(t, language,
|
|
1197
|
+
_preprocess_text_for_ner(t, language, compression_ratio, preprocess_newlines)
|
|
1060
1198
|
)
|
|
1061
1199
|
|
|
1062
1200
|
if _analyzer is None:
|
|
@@ -1084,7 +1222,6 @@ def ner_pipe(
|
|
|
1084
1222
|
language,
|
|
1085
1223
|
model,
|
|
1086
1224
|
engine_type="spacy",
|
|
1087
|
-
fast=False,
|
|
1088
1225
|
compression_ratio="auto",
|
|
1089
1226
|
with_scores=False,
|
|
1090
1227
|
with_comentions=True,
|
|
@@ -1109,8 +1246,8 @@ def ner_pipe(
|
|
|
1109
1246
|
language: Language code (e.g., 'en', 'de', 'fr')
|
|
1110
1247
|
model: Model name or instance for spacy/flair engine
|
|
1111
1248
|
engine_type: 'regex', 'flair', 'spacy' or 'custom'
|
|
1112
|
-
|
|
1113
|
-
|
|
1249
|
+
compression_ratio: Compression ratio for long texts ('auto' or float).
|
|
1250
|
+
'auto' compresses texts over ~15k chars proportionally.
|
|
1114
1251
|
with_scores: Include confidence scores (not implemented)
|
|
1115
1252
|
with_comentions: Include co-mentioned entities
|
|
1116
1253
|
with_context: Include surrounding context
|
|
@@ -1129,7 +1266,7 @@ def ner_pipe(
|
|
|
1129
1266
|
|
|
1130
1267
|
if isinstance(text, list):
|
|
1131
1268
|
return _ner_pipe_batch(
|
|
1132
|
-
text, language, model, engine_type,
|
|
1269
|
+
text, language, model, engine_type, compression_ratio,
|
|
1133
1270
|
with_comentions=with_comentions, with_context=with_context,
|
|
1134
1271
|
entities=entities, score_threshold=score_threshold,
|
|
1135
1272
|
batch_size=batch_size, n_process=n_process,
|
|
@@ -1146,7 +1283,7 @@ def ner_pipe(
|
|
|
1146
1283
|
model=model,
|
|
1147
1284
|
)
|
|
1148
1285
|
|
|
1149
|
-
text = _preprocess_text_for_ner(text, language,
|
|
1286
|
+
text = _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines)
|
|
1150
1287
|
|
|
1151
1288
|
ner = compute_ner_presidio(
|
|
1152
1289
|
text,
|
|
@@ -1168,7 +1305,7 @@ def get_ner_handler(
|
|
|
1168
1305
|
language,
|
|
1169
1306
|
model,
|
|
1170
1307
|
engine_type="spacy",
|
|
1171
|
-
|
|
1308
|
+
compression_ratio="auto",
|
|
1172
1309
|
entities=None,
|
|
1173
1310
|
score_threshold=0.5,
|
|
1174
1311
|
batch_size=32,
|
|
@@ -1186,12 +1323,11 @@ def get_ner_handler(
|
|
|
1186
1323
|
model=model,
|
|
1187
1324
|
)
|
|
1188
1325
|
|
|
1189
|
-
return lambda text, compression_ratio=
|
|
1326
|
+
return lambda text, compression_ratio=compression_ratio, with_scores=False, with_comentions=True, with_context=True: ner_pipe(
|
|
1190
1327
|
text,
|
|
1191
1328
|
language,
|
|
1192
1329
|
model,
|
|
1193
1330
|
engine_type,
|
|
1194
|
-
fast,
|
|
1195
1331
|
compression_ratio,
|
|
1196
1332
|
with_scores,
|
|
1197
1333
|
with_comentions,
|
|
@@ -1203,8 +1339,3 @@ def get_ner_handler(
|
|
|
1203
1339
|
preprocess_newlines,
|
|
1204
1340
|
_analyzer=analyzer
|
|
1205
1341
|
)
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
@st.cache_resource
|
|
1209
|
-
def get_cached_ner_handler(language, model):
|
|
1210
|
-
return get_ner_handler(language, model)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|