streamlit-octostar-utils 0.5.0.dev7__tar.gz → 0.5.0.dev8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/PKG-INFO +1 -1
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/pyproject.toml +1 -1
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/ner.py +214 -83
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/LICENSE +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/README.md +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/dict.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/filetypes.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/timestamp.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/language.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/client.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/context.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/permissions.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/relationships.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/validation.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/common.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/__init__.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
- {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
|
@@ -13,18 +13,14 @@ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, AnalysisExplan
|
|
|
13
13
|
EntityRecognizer, RecognizerResult
|
|
14
14
|
from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
|
|
15
15
|
from presidio_analyzer.predefined_recognizers import SpacyRecognizer, PhoneRecognizer
|
|
16
|
-
import streamlit as st
|
|
17
16
|
import nltk
|
|
18
17
|
from flair.data import Sentence
|
|
19
18
|
from flair.models import SequenceTagger
|
|
20
19
|
|
|
21
20
|
from .custom_recognizers import PhonePatternRecognizer, ModernUrlRecognizer
|
|
22
21
|
|
|
23
|
-
from sumy.parsers.plaintext import PlaintextParser
|
|
24
22
|
from sumy.nlp.tokenizers import Tokenizer
|
|
25
23
|
from sumy.nlp.stemmers import Stemmer
|
|
26
|
-
from sumy.summarizers.lsa import LsaSummarizer
|
|
27
|
-
from sumy.summarizers.luhn import LuhnSummarizer
|
|
28
24
|
from sumy.utils import get_stop_words
|
|
29
25
|
|
|
30
26
|
from .language import to_name, SPACY_MODELS
|
|
@@ -450,36 +446,151 @@ def expand_entities_for_analyzer(entities_list):
|
|
|
450
446
|
return list(expanded)
|
|
451
447
|
|
|
452
448
|
|
|
453
|
-
def
|
|
454
|
-
|
|
455
|
-
from
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
449
|
+
def _map_paragraph_sentences(lines, line_offsets, tokenizer):
    """Sentence-tokenize one paragraph and locate each sentence in the source text.

    The stripped ``lines`` are joined with single spaces, the joined string is
    split with ``tokenizer.to_sentences``, and the start of every sentence is
    translated back to a character offset in the original text through a
    per-line segment map.

    Args:
        lines: Stripped text of each line belonging to the paragraph.
        line_offsets: For each entry of ``lines``, the offset in the original
            text at which that stripped line starts.
        tokenizer: Object exposing ``to_sentences(str)`` that returns the
            sentences of the given string.

    Returns:
        A list of ``(sentence_text, original_char_offset)`` tuples in the order
        produced by the tokenizer; an empty list when the paragraph is empty.
    """
    joined = ' '.join(lines)
    if not joined:
        return []
    sents = tokenizer.to_sentences(joined)

    # One (start_in_joined, start_in_original, length) triple per source line.
    segments = []
    j = 0
    for line_text, orig_start in zip(lines, line_offsets):
        segments.append((j, orig_start, len(line_text)))
        j += len(line_text) + 1  # +1 accounts for the joining space

    def _to_original(pos_in_joined):
        """Translate an index in ``joined`` to an offset in the original text."""
        for j_start, o_start, length in segments:
            if pos_in_joined < j_start + length:
                # A position on the joining space lies one character before
                # this segment; clamp so it maps to the segment start instead
                # of pointing one character before the line's text.
                return o_start + max(pos_in_joined - j_start, 0)
        # Past the end of the joined text: report the end of the last line.
        last = segments[-1]
        return last[1] + last[2]

    results = []
    search_pos = 0
    for sent in sents:
        idx = joined.find(sent, search_pos)
        if idx == -1:
            # The tokenizer altered the sentence text; fall back to the
            # position right after the previous sentence.
            idx = search_pos
        results.append((sent, _to_original(idx)))
        search_pos = idx + len(sent)
    return results
|
|
479
|
+
|
|
480
|
+
|
|
481
|
+
def _tokenize_sentences(text, tokenizer):
    """Break ``text`` into sentences without crossing paragraph boundaries.

    Consecutive non-blank lines form a paragraph; a whitespace-only line ends
    it. Each paragraph is handed to ``_map_paragraph_sentences`` together with
    the original offsets of its stripped lines.

    Args:
        text: The full input text.
        tokenizer: Tokenizer forwarded unchanged to ``_map_paragraph_sentences``.

    Returns:
        A list of ``(sentence_text, original_char_offset)`` tuples.
    """
    pairs = []
    para_lines = []
    para_offsets = []
    cursor = 0  # offset of the current raw line within ``text``
    for raw in text.splitlines(True):
        content = raw.strip()
        if content:
            # Record where the stripped content starts in the original text.
            para_offsets.append(cursor + len(raw) - len(raw.lstrip()))
            para_lines.append(content)
        elif para_lines:
            # A blank line closes the paragraph collected so far.
            pairs.extend(
                _map_paragraph_sentences(para_lines, para_offsets, tokenizer)
            )
            para_lines = []
            para_offsets = []
        cursor += len(raw)
    if para_lines:
        # Flush the trailing paragraph that was not followed by a blank line.
        pairs.extend(
            _map_paragraph_sentences(para_lines, para_offsets, tokenizer)
        )
    return pairs
|
|
474
507
|
|
|
475
|
-
u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
|
|
476
|
-
ranks = iter(summarizer._compute_ranks(sigma, v))
|
|
477
|
-
return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
|
|
478
508
|
|
|
509
|
+
def _build_sentence_vocab(sentences, tokenizer, stemmer, stop_words):
    """Turn sentences into lists of vocabulary indices.

    Every sentence is split with ``tokenizer.to_words``; each word is
    lower-cased, dropped if it is in ``stop_words``, and otherwise stemmed.
    Stems receive consecutive integer ids in order of first appearance.

    Args:
        sentences: Iterable of sentence strings.
        tokenizer: Object exposing ``to_words(str)``.
        stemmer: Callable mapping a lower-cased word to its stem.
        stop_words: Container of lower-cased words to skip.

    Returns:
        sentence_word_indices: list of lists of vocab indices per sentence
        n_vocab: total vocabulary size
        doc_freq: dict mapping vocab index to number of sentences containing it
    """
    stem_to_index = {}
    doc_freq = {}
    sentence_word_indices = []

    for sentence in sentences:
        lowered = (word.lower() for word in tokenizer.to_words(sentence))
        stems = (stemmer(word) for word in lowered if word not in stop_words)
        # setdefault assigns the next free id the first time a stem is seen.
        row = [
            stem_to_index.setdefault(stem, len(stem_to_index)) for stem in stems
        ]
        # Count each term once per sentence, in order of first occurrence.
        for term in dict.fromkeys(row):
            doc_freq[term] = doc_freq.get(term, 0) + 1
        sentence_word_indices.append(row)

    return sentence_word_indices, len(stem_to_index), doc_freq
|
|
542
|
+
|
|
543
|
+
|
|
544
|
+
def _score_tfidf_centroid(sentence_word_indices, n_vocab, doc_freq):
    """Rank sentences by cosine similarity to the mean TF-IDF vector.

    A sparse TF-IDF vector (dict) is built for every non-empty sentence using
    ``0.5 + 0.5 * count / max_count`` as term frequency and
    ``log(n_sents / doc_freq) + 1`` as inverse document frequency. The centroid
    is the sum of those vectors divided by the total number of sentences, and
    a sentence's score is its cosine similarity to that centroid.

    Args:
        sentence_word_indices: List of lists of vocab indices, one per sentence.
        n_vocab: Vocabulary size; zero short-circuits to all-zero scores.
        doc_freq: Dict mapping vocab index to the number of sentences containing it.

    Returns:
        A list with one float score per sentence; sentences without any
        indices score ``0.0``.
    """
    n_sents = len(sentence_word_indices)
    if not n_sents or not n_vocab:
        return [0.0] * n_sents

    idf = {}
    for term, df in doc_freq.items():
        idf[term] = math.log(n_sents / df) + 1.0

    centroid = {}
    vectors = []
    for indices in sentence_word_indices:
        if not indices:
            # Placeholder keeps scores aligned with the input sentences.
            vectors.append(None)
            continue
        counts = {}
        for term in indices:
            counts[term] = counts.get(term, 0) + 1
        peak = max(counts.values())
        vector = {}
        for term, occurrences in counts.items():
            weight = (0.5 + 0.5 * occurrences / peak) * idf.get(term, 0.0)
            vector[term] = weight
            centroid[term] = centroid.get(term, 0.0) + weight
        vectors.append(vector)

    # Turn the accumulated sum into a mean and measure its length on the way.
    scale = 1.0 / n_sents
    norm_sq = 0.0
    for term in centroid:
        centroid[term] *= scale
        norm_sq += centroid[term] ** 2
    centroid_norm = math.sqrt(norm_sq)

    if centroid_norm == 0:
        return [0.0] * n_sents

    def _cosine(vector):
        """Cosine similarity between one sentence vector and the centroid."""
        if vector is None:
            return 0.0
        dot = sum(val * centroid.get(idx, 0.0) for idx, val in vector.items())
        sent_norm = math.sqrt(sum(v * v for v in vector.values()))
        if sent_norm > 0:
            return dot / (sent_norm * centroid_norm)
        return 0.0

    return [_cosine(vector) for vector in vectors]
|
|
483
594
|
|
|
484
595
|
|
|
485
596
|
def get_nltk_tokenizer(language: str) -> Tokenizer:
|
|
@@ -911,44 +1022,72 @@ def _compute_ner_batch(
|
|
|
911
1022
|
return all_ner_objects
|
|
912
1023
|
|
|
913
1024
|
|
|
914
|
-
def get_extractive_summary(text, language, max_chars,
|
|
1025
|
+
def get_extractive_summary(text, language, max_chars, with_scores=False):
    """Extract a summary using TF-IDF centroid sentence scoring.

    Scores all sentences by cosine similarity to the TF-IDF document centroid,
    then greedily selects the highest-scoring sentences up to max_chars. The
    sentence that crosses the budget (the last one picked in score order) is
    truncated so that the joined summary does not exceed max_chars; selected
    sentences are then restored to document order.

    Args:
        text: Input text to summarize.
        language: Language code (e.g. 'en', 'de', 'fr').
        max_chars: Maximum character budget for the summary.
        with_scores: If True, return list of (sentence_text, normalized_score,
            original_char_offset) 3-tuples in document order.
            If False, return a single joined summary string.

    Returns:
        A summary string, or a list of 3-tuples when ``with_scores`` is True.
        Scores are min-max normalized over the selected sentences. An empty
        string / empty list is returned when nothing is selected.
    """
    tokenizer = get_nltk_tokenizer(language)
    stemmer = Stemmer(language)
    stop_words = frozenset(w.lower() for w in get_stop_words(language))

    sentence_pairs = _tokenize_sentences(text, tokenizer)
    if not sentence_pairs:
        return [] if with_scores else ""

    sentence_texts = [s for s, _ in sentence_pairs]

    word_indices, n_vocab, doc_freq = _build_sentence_vocab(
        sentence_texts, tokenizer, stemmer, stop_words
    )
    scores = _score_tfidf_centroid(word_indices, n_vocab, doc_freq)

    # Best score first; the sort is stable, so ties keep document order.
    ranked = sorted(
        range(len(sentence_pairs)), key=lambda i: scores[i], reverse=True
    )

    selected = []
    total_chars = 0
    for order in ranked:
        remaining = max_chars - total_chars
        if remaining <= 0:
            break
        sent_text, offset = sentence_pairs[order]
        # Truncate here, while we still know which sentence crossed the
        # budget. Doing it after re-sorting into document order would cut
        # whichever sentence happens to come last in the document instead.
        selected.append((order, sent_text[:remaining], scores[order], offset))
        total_chars += len(sent_text) + 1  # +1 for the joining space

    selected.sort(key=lambda item: item[0])
    summary = [(s, score, offset) for _, s, score, offset in selected]

    if not with_scores:
        return " ".join(s[0] for s in summary)

    if not summary:
        return []
    min_score = min(s[1] for s in summary)
    max_score = max(s[1] for s in summary)
    # Avoid dividing by zero when every selected sentence has the same score.
    score_range = (max_score - min_score) if max_score != min_score else 1.0
    return [(s[0], (s[1] - min_score) / score_range, s[2]) for s in summary]
|
|
952
1091
|
|
|
953
1092
|
|
|
954
1093
|
def _preprocess_newlines_for_ner(text: str) -> str:
|
|
@@ -1008,7 +1147,7 @@ def _strip_honorifics_for_ner(text: str) -> str:
|
|
|
1008
1147
|
return result
|
|
1009
1148
|
|
|
1010
1149
|
|
|
1011
|
-
def _preprocess_text_for_ner(text, language,
|
|
1150
|
+
def _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines):
|
|
1012
1151
|
"""Preprocess a single text for NER (newlines, honorifics, compression)."""
|
|
1013
1152
|
if preprocess_newlines:
|
|
1014
1153
|
text = _preprocess_newlines_for_ner(text)
|
|
@@ -1016,11 +1155,11 @@ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess
|
|
|
1016
1155
|
|
|
1017
1156
|
cr = compression_ratio
|
|
1018
1157
|
if cr == "auto":
|
|
1019
|
-
cr = max(1.0, len(text) / 15000)
|
|
1158
|
+
cr = max(1.0, len(text) / 15000)
|
|
1020
1159
|
|
|
1021
1160
|
if cr > 1.0:
|
|
1022
1161
|
sentences = get_extractive_summary(
|
|
1023
|
-
text, language, int(len(text) / cr),
|
|
1162
|
+
text, language, int(len(text) / cr), with_scores=True
|
|
1024
1163
|
)
|
|
1025
1164
|
text = " ".join([s[0] for s in sentences])
|
|
1026
1165
|
|
|
@@ -1032,7 +1171,6 @@ def _ner_pipe_batch(
|
|
|
1032
1171
|
language,
|
|
1033
1172
|
model,
|
|
1034
1173
|
engine_type="spacy",
|
|
1035
|
-
fast=False,
|
|
1036
1174
|
compression_ratio="auto",
|
|
1037
1175
|
with_comentions=True,
|
|
1038
1176
|
with_context=True,
|
|
@@ -1056,7 +1194,7 @@ def _ner_pipe_batch(
|
|
|
1056
1194
|
if not isinstance(t, str):
|
|
1057
1195
|
raise TypeError(f"Each text must be str, not {type(t).__name__}")
|
|
1058
1196
|
processed_texts.append(
|
|
1059
|
-
_preprocess_text_for_ner(t, language,
|
|
1197
|
+
_preprocess_text_for_ner(t, language, compression_ratio, preprocess_newlines)
|
|
1060
1198
|
)
|
|
1061
1199
|
|
|
1062
1200
|
if _analyzer is None:
|
|
@@ -1084,7 +1222,6 @@ def ner_pipe(
|
|
|
1084
1222
|
language,
|
|
1085
1223
|
model,
|
|
1086
1224
|
engine_type="spacy",
|
|
1087
|
-
fast=False,
|
|
1088
1225
|
compression_ratio="auto",
|
|
1089
1226
|
with_scores=False,
|
|
1090
1227
|
with_comentions=True,
|
|
@@ -1109,8 +1246,8 @@ def ner_pipe(
|
|
|
1109
1246
|
language: Language code (e.g., 'en', 'de', 'fr')
|
|
1110
1247
|
model: Model name or instance for spacy/flair engine
|
|
1111
1248
|
engine_type: 'regex', 'flair', 'spacy' or 'custom'
|
|
1112
|
-
|
|
1113
|
-
|
|
1249
|
+
compression_ratio: Compression ratio for long texts ('auto' or float).
|
|
1250
|
+
'auto' compresses texts over ~15k chars proportionally.
|
|
1114
1251
|
with_scores: Include confidence scores (not implemented)
|
|
1115
1252
|
with_comentions: Include co-mentioned entities
|
|
1116
1253
|
with_context: Include surrounding context
|
|
@@ -1129,7 +1266,7 @@ def ner_pipe(
|
|
|
1129
1266
|
|
|
1130
1267
|
if isinstance(text, list):
|
|
1131
1268
|
return _ner_pipe_batch(
|
|
1132
|
-
text, language, model, engine_type,
|
|
1269
|
+
text, language, model, engine_type, compression_ratio,
|
|
1133
1270
|
with_comentions=with_comentions, with_context=with_context,
|
|
1134
1271
|
entities=entities, score_threshold=score_threshold,
|
|
1135
1272
|
batch_size=batch_size, n_process=n_process,
|
|
@@ -1146,7 +1283,7 @@ def ner_pipe(
|
|
|
1146
1283
|
model=model,
|
|
1147
1284
|
)
|
|
1148
1285
|
|
|
1149
|
-
text = _preprocess_text_for_ner(text, language,
|
|
1286
|
+
text = _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines)
|
|
1150
1287
|
|
|
1151
1288
|
ner = compute_ner_presidio(
|
|
1152
1289
|
text,
|
|
@@ -1168,7 +1305,7 @@ def get_ner_handler(
|
|
|
1168
1305
|
language,
|
|
1169
1306
|
model,
|
|
1170
1307
|
engine_type="spacy",
|
|
1171
|
-
|
|
1308
|
+
compression_ratio="auto",
|
|
1172
1309
|
entities=None,
|
|
1173
1310
|
score_threshold=0.5,
|
|
1174
1311
|
batch_size=32,
|
|
@@ -1186,12 +1323,11 @@ def get_ner_handler(
|
|
|
1186
1323
|
model=model,
|
|
1187
1324
|
)
|
|
1188
1325
|
|
|
1189
|
-
return lambda text, compression_ratio=
|
|
1326
|
+
return lambda text, compression_ratio=compression_ratio, with_scores=False, with_comentions=True, with_context=True: ner_pipe(
|
|
1190
1327
|
text,
|
|
1191
1328
|
language,
|
|
1192
1329
|
model,
|
|
1193
1330
|
engine_type,
|
|
1194
|
-
fast,
|
|
1195
1331
|
compression_ratio,
|
|
1196
1332
|
with_scores,
|
|
1197
1333
|
with_comentions,
|
|
@@ -1203,8 +1339,3 @@ def get_ner_handler(
|
|
|
1203
1339
|
preprocess_newlines,
|
|
1204
1340
|
_analyzer=analyzer
|
|
1205
1341
|
)
|
|
1206
|
-
|
|
1207
|
-
|
|
1208
|
-
@st.cache_resource
|
|
1209
|
-
def get_cached_ner_handler(language, model):
|
|
1210
|
-
return get_ner_handler(language, model)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|