PyPI - streamlit-octostar-utils - Versions diffs - 0.5.0.dev7__tar.gz → 0.5.0.dev8__tar.gz - Mend

streamlit-octostar-utils 0.5.0.dev7.tar.gz → 0.5.0.dev8.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (45) [hide / show]

{streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: streamlit-octostar-utils
-Version: 0.5.0.dev7
+Version: 0.5.0.dev8
 Summary:
 License: MIT
 License-File: LICENSE

{streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/pyproject.toml RENAMED Viewed

@@ -5,7 +5,7 @@ include = '\.pyi?$'
 [tool.poetry]
 name = "streamlit-octostar-utils"
-version = "0.5.0-dev.7"
+version = "0.5.0-dev.8"
 description = ""
 license = "MIT"
 authors = ["Octostar"]

{streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/ner.py RENAMED Viewed

@@ -13,18 +13,14 @@ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, AnalysisExplan
     EntityRecognizer, RecognizerResult
 from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
 from presidio_analyzer.predefined_recognizers import SpacyRecognizer, PhoneRecognizer
-import streamlit as st
 import nltk
 from flair.data import Sentence
 from flair.models import SequenceTagger
 from .custom_recognizers import PhonePatternRecognizer, ModernUrlRecognizer
-from sumy.parsers.plaintext import PlaintextParser
 from sumy.nlp.tokenizers import Tokenizer
 from sumy.nlp.stemmers import Stemmer
-from sumy.summarizers.lsa import LsaSummarizer
-from sumy.summarizers.luhn import LuhnSummarizer
 from sumy.utils import get_stop_words
 from .language import to_name, SPACY_MODELS
@@ -450,36 +446,151 @@ def expand_entities_for_analyzer(entities_list):
     return list(expanded)
-def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
-    from operator import attrgetter
-    from sumy.summarizers._summarizer import SentenceInfo
-    rate = rating
-    if isinstance(rating, dict):
-        assert not args and not kwargs
-        rate = lambda s: rating[s]
-    infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
-    infos = sorted(infos, key=attrgetter("rating"), reverse=True)
-    return tuple((i.sentence, i.rating, i.order) for i in infos)
-def _sumy__lsa_call(summarizer, document):
-    summarizer._ensure_dependecies_installed()
-    dictionary = summarizer._create_dictionary(document)
-    if not dictionary:
-        return ()
-    matrix = summarizer._create_matrix(document, dictionary)
-    matrix = summarizer._compute_term_frequency(matrix)
-    from numpy.linalg import svd as singular_value_decomposition
+def _map_paragraph_sentences(lines, line_offsets, tokenizer):
+    """Sentence-tokenize a joined paragraph and map each sentence back to its
+    original character offset using a segment map built from the source lines."""
+    joined = ' '.join(lines)
+    if not joined:
+        return []
+    sents = tokenizer.to_sentences(joined)
+    segments = []
+    j = 0
+    for line_text, orig_start in zip(lines, line_offsets):
+        segments.append((j, orig_start, len(line_text)))
+        j += len(line_text) + 1
+    def _to_original(pos_in_joined):
+        for j_start, o_start, length in segments:
+            if pos_in_joined < j_start + length:
+                return o_start + (pos_in_joined - j_start)
+        last = segments[-1]
+        return last[1] + last[2]
+    results = []
+    search_pos = 0
+    for sent in sents:
+        idx = joined.find(sent, search_pos)
+        if idx == -1:
+            idx = search_pos
+        results.append((sent, _to_original(idx)))
+        search_pos = idx + len(sent)
+    return results
+def _tokenize_sentences(text, tokenizer):
+    """Split text into (sentence_text, original_char_offset) pairs,
+    respecting paragraph boundaries."""
+    results = []
+    current_lines = []
+    line_offsets = []
+    pos = 0
+    for line in text.splitlines(True):
+        stripped = line.strip()
+        if not stripped:
+            if current_lines:
+                results.extend(
+                    _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
+                )
+                current_lines = []
+                line_offsets = []
+        else:
+            leading = len(line) - len(line.lstrip())
+            line_offsets.append(pos + leading)
+            current_lines.append(stripped)
+        pos += len(line)
+    if current_lines:
+        results.extend(
+            _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
+        )
+    return results
-    u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
-    ranks = iter(summarizer._compute_ranks(sigma, v))
-    return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
+def _build_sentence_vocab(sentences, tokenizer, stemmer, stop_words):
+    """Tokenize words, stem, filter stop words, and build a vocabulary index.
-def _sumy__luhn_call(summarizer, document):
-    words = summarizer._get_significant_words(document.words)
-    return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
+    Returns:
+        sentence_word_indices: list of lists of vocab indices per sentence
+        n_vocab: total vocabulary size
+        doc_freq: dict mapping vocab index to number of sentences containing it
+    """
+    vocab = {}
+    doc_freq = {}
+    n_vocab = 0
+    sentence_word_indices = []
+    for sent_text in sentences:
+        words = tokenizer.to_words(sent_text)
+        indices = []
+        seen = set()
+        for w in words:
+            normalized = w.lower()
+            if normalized in stop_words:
+                continue
+            stemmed = stemmer(normalized)
+            if stemmed not in vocab:
+                vocab[stemmed] = n_vocab
+                n_vocab += 1
+            idx = vocab[stemmed]
+            indices.append(idx)
+            if idx not in seen:
+                seen.add(idx)
+                doc_freq[idx] = doc_freq.get(idx, 0) + 1
+        sentence_word_indices.append(indices)
+    return sentence_word_indices, n_vocab, doc_freq
+def _score_tfidf_centroid(sentence_word_indices, n_vocab, doc_freq):
+    """Score sentences by TF-IDF cosine similarity to the document centroid.
+    Each sentence gets a TF-IDF vector (sparse dict). The centroid is the mean
+    of all sentence vectors. Sentence score = cosine(sentence_vec, centroid).
+    Runs in O(S * avg_words) time and memory with no dense matrix allocation.
+    """
+    n_sents = len(sentence_word_indices)
+    if n_sents == 0 or n_vocab == 0:
+        return [0.0] * n_sents
+    idf = {idx: math.log(n_sents / count) + 1.0 for idx, count in doc_freq.items()}
+    centroid = {}
+    sentence_tfidf = []
+    for indices in sentence_word_indices:
+        if not indices:
+            sentence_tfidf.append(None)
+            continue
+        tf = {}
+        for idx in indices:
+            tf[idx] = tf.get(idx, 0) + 1
+        max_tf = max(tf.values())
+        tfidf = {}
+        for idx, count in tf.items():
+            val = (0.5 + 0.5 * count / max_tf) * idf.get(idx, 0.0)
+            tfidf[idx] = val
+            centroid[idx] = centroid.get(idx, 0.0) + val
+        sentence_tfidf.append(tfidf)
+    inv_n = 1.0 / n_sents
+    centroid_norm_sq = 0.0
+    for idx in centroid:
+        centroid[idx] *= inv_n
+        centroid_norm_sq += centroid[idx] ** 2
+    centroid_norm = math.sqrt(centroid_norm_sq)
+    if centroid_norm == 0:
+        return [0.0] * n_sents
+    scores = []
+    for tfidf in sentence_tfidf:
+        if tfidf is None:
+            scores.append(0.0)
+            continue
+        dot = sum(val * centroid.get(idx, 0.0) for idx, val in tfidf.items())
+        sent_norm = math.sqrt(sum(v * v for v in tfidf.values()))
+        scores.append(dot / (sent_norm * centroid_norm) if sent_norm > 0 else 0.0)
+    return scores
 def get_nltk_tokenizer(language: str) -> Tokenizer:
@@ -911,44 +1022,72 @@ def _compute_ner_batch(
     return all_ner_objects
-def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
+def get_extractive_summary(text, language, max_chars, with_scores=False):
+    """Extract a summary using TF-IDF centroid sentence scoring.
+    Scores all sentences by cosine similarity to the TF-IDF document centroid,
+    then greedily selects the highest-scoring sentences up to max_chars.
+    Args:
+        text: Input text to summarize.
+        language: Language code (e.g. 'en', 'de', 'fr').
+        max_chars: Maximum character budget for the summary.
+        with_scores: If True, return list of (sentence_text, normalized_score,
+                     original_char_offset) 3-tuples in document order.
+                     If False, return a single joined summary string.
+    """
     tokenizer = get_nltk_tokenizer(language)
     stemmer = Stemmer(language)
-    parser = PlaintextParser.from_string(text, tokenizer)
-    if fast:
-        summarizer = LuhnSummarizer(stemmer)
-        summarizer.stop_words = get_stop_words(language)
-        scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
-    else:
-        summarizer = LsaSummarizer(stemmer)
-        summarizer.stop_words = get_stop_words(language)
-        scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
-    summary = []
-    summary_chars = 0
-    summary_chars_penultimate = 0
-    while summary_chars < max_chars:
-        try:
-            next_sentence = next(scored_sentences)
-            summary.append(next_sentence)
-            summary_chars_penultimate = summary_chars
-            summary_chars += len(" " + next_sentence[0]._text)
-        except StopIteration:
+    stop_words = frozenset(w.lower() for w in get_stop_words(language))
+    sentence_pairs = _tokenize_sentences(text, tokenizer)
+    if not sentence_pairs:
+        return [] if with_scores else ""
+    sentence_texts = [s for s, _ in sentence_pairs]
+    sentence_offsets = [off for _, off in sentence_pairs]
+    word_indices, n_vocab, doc_freq = _build_sentence_vocab(
+        sentence_texts, tokenizer, stemmer, stop_words
+    )
+    scores = _score_tfidf_centroid(word_indices, n_vocab, doc_freq)
+    scored = sorted(
+        ((sentence_texts[i], scores[i], sentence_offsets[i], i)
+         for i in range(len(sentence_texts))),
+        key=lambda x: x[1],
+        reverse=True,
+    )
+    selected = []
+    total_chars = 0
+    chars_before_last = 0
+    for sent_text, score, offset, order in scored:
+        if total_chars >= max_chars:
             break
-    summary = sorted(summary, key=lambda x: x[2])
-    summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
-    if summary_chars > max_chars:
-        summary[-1] = (
-            summary[-1][0][: max_chars - summary_chars_penultimate],
-            summary[-1][1],
-        )
+        selected.append((sent_text, score, offset, order))
+        chars_before_last = total_chars
+        total_chars += len(sent_text) + 1
+    selected.sort(key=lambda x: x[3])
+    summary = [(s[0], s[1], s[2]) for s in selected]
+    if total_chars > max_chars and summary:
+        remaining = max_chars - chars_before_last
+        if remaining > 0:
+            summary[-1] = (summary[-1][0][:remaining], summary[-1][1], summary[-1][2])
+        else:
+            summary.pop()
     if not with_scores:
-        summary = " ".join([s[0] for s in summary])
-    else:
-        min_score = min([s[1] for s in summary]) if summary else 0
-        max_score = max([min_score] + [s[1] for s in summary])
-        score_range = 1 if min_score == max_score else (max_score - min_score)
-        summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
-    return summary
+        return " ".join(s[0] for s in summary)
+    if not summary:
+        return []
+    min_score = min(s[1] for s in summary)
+    max_score = max(s[1] for s in summary)
+    score_range = (max_score - min_score) if max_score != min_score else 1.0
+    return [(s[0], (s[1] - min_score) / score_range, s[2]) for s in summary]
 def _preprocess_newlines_for_ner(text: str) -> str:
@@ -1008,7 +1147,7 @@ def _strip_honorifics_for_ner(text: str) -> str:
     return result
-def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines):
+def _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines):
     """Preprocess a single text for NER (newlines, honorifics, compression)."""
     if preprocess_newlines:
         text = _preprocess_newlines_for_ner(text)
@@ -1016,11 +1155,11 @@ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess
     cr = compression_ratio
     if cr == "auto":
-        cr = max(1.0, len(text) / 15000) if fast else 1.0
+        cr = max(1.0, len(text) / 15000)
     if cr > 1.0:
         sentences = get_extractive_summary(
-            text, language, int(len(text) / cr), fast=fast, with_scores=True
+            text, language, int(len(text) / cr), with_scores=True
         )
         text = " ".join([s[0] for s in sentences])
@@ -1032,7 +1171,6 @@ def _ner_pipe_batch(
         language,
         model,
         engine_type="spacy",
-        fast=False,
         compression_ratio="auto",
         with_comentions=True,
         with_context=True,
@@ -1056,7 +1194,7 @@ def _ner_pipe_batch(
         if not isinstance(t, str):
             raise TypeError(f"Each text must be str, not {type(t).__name__}")
         processed_texts.append(
-            _preprocess_text_for_ner(t, language, fast, compression_ratio, preprocess_newlines)
+            _preprocess_text_for_ner(t, language, compression_ratio, preprocess_newlines)
         )
     if _analyzer is None:
@@ -1084,7 +1222,6 @@ def ner_pipe(
         language,
         model,
         engine_type="spacy",
-        fast=False,
         compression_ratio="auto",
         with_scores=False,
         with_comentions=True,
@@ -1109,8 +1246,8 @@ def ner_pipe(
         language: Language code (e.g., 'en', 'de', 'fr')
         model: Model name or instance for spacy/flair engine
         engine_type: 'regex', 'flair', 'spacy' or 'custom'
-        fast: Use fast summarization for long texts
-        compression_ratio: Compression ratio for long texts ('auto' or float)
+        compression_ratio: Compression ratio for long texts ('auto' or float).
+                           'auto' compresses texts over ~15k chars proportionally.
         with_scores: Include confidence scores (not implemented)
         with_comentions: Include co-mentioned entities
         with_context: Include surrounding context
@@ -1129,7 +1266,7 @@ def ner_pipe(
     if isinstance(text, list):
         return _ner_pipe_batch(
-            text, language, model, engine_type, fast, compression_ratio,
+            text, language, model, engine_type, compression_ratio,
             with_comentions=with_comentions, with_context=with_context,
             entities=entities, score_threshold=score_threshold,
             batch_size=batch_size, n_process=n_process,
@@ -1146,7 +1283,7 @@ def ner_pipe(
             model=model,
         )
-    text = _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines)
+    text = _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines)
     ner = compute_ner_presidio(
         text,
@@ -1168,7 +1305,7 @@ def get_ner_handler(
         language,
         model,
         engine_type="spacy",
-        fast=False,
+        compression_ratio="auto",
         entities=None,
         score_threshold=0.5,
         batch_size=32,
@@ -1186,12 +1323,11 @@ def get_ner_handler(
         model=model,
     )
-    return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
+    return lambda text, compression_ratio=compression_ratio, with_scores=False, with_comentions=True, with_context=True: ner_pipe(
         text,
         language,
         model,
         engine_type,
-        fast,
         compression_ratio,
         with_scores,
         with_comentions,
@@ -1203,8 +1339,3 @@ def get_ner_handler(
         preprocess_newlines,
         _analyzer=analyzer
     )
-@st.cache_resource
-def get_cached_ner_handler(language, model):
-    return get_ner_handler(language, model)