streamlit-octostar-utils 0.5.0.dev7__tar.gz → 0.5.0.dev9__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45):
  1. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/celery.py +7 -0
  4. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/nlp/ner.py +214 -83
  5. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/LICENSE +0 -0
  6. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/README.md +0 -0
  7. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/__init__.py +0 -0
  8. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  9. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
  10. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  11. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/nifi.py +0 -0
  12. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  31. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/nlp/language.py +0 -0
  32. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  33. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/octostar/client.py +0 -0
  34. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/octostar/context.py +0 -0
  35. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  36. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  37. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  38. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  39. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/ontology/validation.py +0 -0
  40. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/style/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/style/common.py +0 -0
  42. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/threading/__init__.py +0 -0
  43. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  44. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  45. {streamlit_octostar_utils-0.5.0.dev7 → streamlit_octostar_utils-0.5.0.dev9}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.5.0.dev7
3
+ Version: 0.5.0.dev9
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.5.0-dev.7"
8
+ version = "0.5.0-dev.9"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -291,6 +291,13 @@ class CeleryExecutor(object):
291
291
  def set_started_state(self, task_id, task, *args, **kwargs):
292
292
  result = AsyncResult(task_id, app=self.app)
293
293
  result.backend.store_result(task_id, result=None, state=CeleryExecutor.STARTED)
294
+ request_timelimit = getattr(getattr(task, "request", None), "timelimit", None) or (None, None)
295
+ time_limit = request_timelimit[0] or getattr(task, "time_limit", None) or 0
296
+ extended_ttl = int(time_limit) + int(self.app.conf.result_expires)
297
+ if extended_ttl > int(self.app.conf.result_expires):
298
+ self.redis_client.expire(
299
+ f"{CeleryExecutor.CELERY_BROKER_PREFIX}{task_id}", extended_ttl
300
+ )
294
301
 
295
302
  def register_worker_initialization(self):
296
303
  if self.preload_functions:
@@ -13,18 +13,14 @@ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, AnalysisExplan
13
13
  EntityRecognizer, RecognizerResult
14
14
  from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
15
15
  from presidio_analyzer.predefined_recognizers import SpacyRecognizer, PhoneRecognizer
16
- import streamlit as st
17
16
  import nltk
18
17
  from flair.data import Sentence
19
18
  from flair.models import SequenceTagger
20
19
 
21
20
  from .custom_recognizers import PhonePatternRecognizer, ModernUrlRecognizer
22
21
 
23
- from sumy.parsers.plaintext import PlaintextParser
24
22
  from sumy.nlp.tokenizers import Tokenizer
25
23
  from sumy.nlp.stemmers import Stemmer
26
- from sumy.summarizers.lsa import LsaSummarizer
27
- from sumy.summarizers.luhn import LuhnSummarizer
28
24
  from sumy.utils import get_stop_words
29
25
 
30
26
  from .language import to_name, SPACY_MODELS
@@ -450,36 +446,151 @@ def expand_entities_for_analyzer(entities_list):
450
446
  return list(expanded)
451
447
 
452
448
 
453
- def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
454
- from operator import attrgetter
455
- from sumy.summarizers._summarizer import SentenceInfo
456
-
457
- rate = rating
458
- if isinstance(rating, dict):
459
- assert not args and not kwargs
460
- rate = lambda s: rating[s]
461
- infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
462
- infos = sorted(infos, key=attrgetter("rating"), reverse=True)
463
- return tuple((i.sentence, i.rating, i.order) for i in infos)
464
-
465
-
466
- def _sumy__lsa_call(summarizer, document):
467
- summarizer._ensure_dependecies_installed()
468
- dictionary = summarizer._create_dictionary(document)
469
- if not dictionary:
470
- return ()
471
- matrix = summarizer._create_matrix(document, dictionary)
472
- matrix = summarizer._compute_term_frequency(matrix)
473
- from numpy.linalg import svd as singular_value_decomposition
449
+ def _map_paragraph_sentences(lines, line_offsets, tokenizer):
450
+ """Sentence-tokenize a joined paragraph and map each sentence back to its
451
+ original character offset using a segment map built from the source lines."""
452
+ joined = ' '.join(lines)
453
+ if not joined:
454
+ return []
455
+ sents = tokenizer.to_sentences(joined)
456
+
457
+ segments = []
458
+ j = 0
459
+ for line_text, orig_start in zip(lines, line_offsets):
460
+ segments.append((j, orig_start, len(line_text)))
461
+ j += len(line_text) + 1
462
+
463
+ def _to_original(pos_in_joined):
464
+ for j_start, o_start, length in segments:
465
+ if pos_in_joined < j_start + length:
466
+ return o_start + (pos_in_joined - j_start)
467
+ last = segments[-1]
468
+ return last[1] + last[2]
469
+
470
+ results = []
471
+ search_pos = 0
472
+ for sent in sents:
473
+ idx = joined.find(sent, search_pos)
474
+ if idx == -1:
475
+ idx = search_pos
476
+ results.append((sent, _to_original(idx)))
477
+ search_pos = idx + len(sent)
478
+ return results
479
+
480
+
481
+ def _tokenize_sentences(text, tokenizer):
482
+ """Split text into (sentence_text, original_char_offset) pairs,
483
+ respecting paragraph boundaries."""
484
+ results = []
485
+ current_lines = []
486
+ line_offsets = []
487
+ pos = 0
488
+ for line in text.splitlines(True):
489
+ stripped = line.strip()
490
+ if not stripped:
491
+ if current_lines:
492
+ results.extend(
493
+ _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
494
+ )
495
+ current_lines = []
496
+ line_offsets = []
497
+ else:
498
+ leading = len(line) - len(line.lstrip())
499
+ line_offsets.append(pos + leading)
500
+ current_lines.append(stripped)
501
+ pos += len(line)
502
+ if current_lines:
503
+ results.extend(
504
+ _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
505
+ )
506
+ return results
474
507
 
475
- u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
476
- ranks = iter(summarizer._compute_ranks(sigma, v))
477
- return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
478
508
 
509
+ def _build_sentence_vocab(sentences, tokenizer, stemmer, stop_words):
510
+ """Tokenize words, stem, filter stop words, and build a vocabulary index.
479
511
 
480
- def _sumy__luhn_call(summarizer, document):
481
- words = summarizer._get_significant_words(document.words)
482
- return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
512
+ Returns:
513
+ sentence_word_indices: list of lists of vocab indices per sentence
514
+ n_vocab: total vocabulary size
515
+ doc_freq: dict mapping vocab index to number of sentences containing it
516
+ """
517
+ vocab = {}
518
+ doc_freq = {}
519
+ n_vocab = 0
520
+ sentence_word_indices = []
521
+
522
+ for sent_text in sentences:
523
+ words = tokenizer.to_words(sent_text)
524
+ indices = []
525
+ seen = set()
526
+ for w in words:
527
+ normalized = w.lower()
528
+ if normalized in stop_words:
529
+ continue
530
+ stemmed = stemmer(normalized)
531
+ if stemmed not in vocab:
532
+ vocab[stemmed] = n_vocab
533
+ n_vocab += 1
534
+ idx = vocab[stemmed]
535
+ indices.append(idx)
536
+ if idx not in seen:
537
+ seen.add(idx)
538
+ doc_freq[idx] = doc_freq.get(idx, 0) + 1
539
+ sentence_word_indices.append(indices)
540
+
541
+ return sentence_word_indices, n_vocab, doc_freq
542
+
543
+
544
+ def _score_tfidf_centroid(sentence_word_indices, n_vocab, doc_freq):
545
+ """Score sentences by TF-IDF cosine similarity to the document centroid.
546
+
547
+ Each sentence gets a TF-IDF vector (sparse dict). The centroid is the mean
548
+ of all sentence vectors. Sentence score = cosine(sentence_vec, centroid).
549
+ Runs in O(S * avg_words) time and memory with no dense matrix allocation.
550
+ """
551
+ n_sents = len(sentence_word_indices)
552
+ if n_sents == 0 or n_vocab == 0:
553
+ return [0.0] * n_sents
554
+
555
+ idf = {idx: math.log(n_sents / count) + 1.0 for idx, count in doc_freq.items()}
556
+
557
+ centroid = {}
558
+ sentence_tfidf = []
559
+ for indices in sentence_word_indices:
560
+ if not indices:
561
+ sentence_tfidf.append(None)
562
+ continue
563
+ tf = {}
564
+ for idx in indices:
565
+ tf[idx] = tf.get(idx, 0) + 1
566
+ max_tf = max(tf.values())
567
+ tfidf = {}
568
+ for idx, count in tf.items():
569
+ val = (0.5 + 0.5 * count / max_tf) * idf.get(idx, 0.0)
570
+ tfidf[idx] = val
571
+ centroid[idx] = centroid.get(idx, 0.0) + val
572
+ sentence_tfidf.append(tfidf)
573
+
574
+ inv_n = 1.0 / n_sents
575
+ centroid_norm_sq = 0.0
576
+ for idx in centroid:
577
+ centroid[idx] *= inv_n
578
+ centroid_norm_sq += centroid[idx] ** 2
579
+ centroid_norm = math.sqrt(centroid_norm_sq)
580
+
581
+ if centroid_norm == 0:
582
+ return [0.0] * n_sents
583
+
584
+ scores = []
585
+ for tfidf in sentence_tfidf:
586
+ if tfidf is None:
587
+ scores.append(0.0)
588
+ continue
589
+ dot = sum(val * centroid.get(idx, 0.0) for idx, val in tfidf.items())
590
+ sent_norm = math.sqrt(sum(v * v for v in tfidf.values()))
591
+ scores.append(dot / (sent_norm * centroid_norm) if sent_norm > 0 else 0.0)
592
+
593
+ return scores
483
594
 
484
595
 
485
596
  def get_nltk_tokenizer(language: str) -> Tokenizer:
@@ -911,44 +1022,72 @@ def _compute_ner_batch(
911
1022
  return all_ner_objects
912
1023
 
913
1024
 
914
- def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
1025
+ def get_extractive_summary(text, language, max_chars, with_scores=False):
1026
+ """Extract a summary using TF-IDF centroid sentence scoring.
1027
+
1028
+ Scores all sentences by cosine similarity to the TF-IDF document centroid,
1029
+ then greedily selects the highest-scoring sentences up to max_chars.
1030
+
1031
+ Args:
1032
+ text: Input text to summarize.
1033
+ language: Language code (e.g. 'en', 'de', 'fr').
1034
+ max_chars: Maximum character budget for the summary.
1035
+ with_scores: If True, return list of (sentence_text, normalized_score,
1036
+ original_char_offset) 3-tuples in document order.
1037
+ If False, return a single joined summary string.
1038
+ """
915
1039
  tokenizer = get_nltk_tokenizer(language)
916
1040
  stemmer = Stemmer(language)
917
- parser = PlaintextParser.from_string(text, tokenizer)
918
- if fast:
919
- summarizer = LuhnSummarizer(stemmer)
920
- summarizer.stop_words = get_stop_words(language)
921
- scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
922
- else:
923
- summarizer = LsaSummarizer(stemmer)
924
- summarizer.stop_words = get_stop_words(language)
925
- scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
926
- summary = []
927
- summary_chars = 0
928
- summary_chars_penultimate = 0
929
- while summary_chars < max_chars:
930
- try:
931
- next_sentence = next(scored_sentences)
932
- summary.append(next_sentence)
933
- summary_chars_penultimate = summary_chars
934
- summary_chars += len(" " + next_sentence[0]._text)
935
- except StopIteration:
1041
+ stop_words = frozenset(w.lower() for w in get_stop_words(language))
1042
+
1043
+ sentence_pairs = _tokenize_sentences(text, tokenizer)
1044
+ if not sentence_pairs:
1045
+ return [] if with_scores else ""
1046
+
1047
+ sentence_texts = [s for s, _ in sentence_pairs]
1048
+ sentence_offsets = [off for _, off in sentence_pairs]
1049
+
1050
+ word_indices, n_vocab, doc_freq = _build_sentence_vocab(
1051
+ sentence_texts, tokenizer, stemmer, stop_words
1052
+ )
1053
+ scores = _score_tfidf_centroid(word_indices, n_vocab, doc_freq)
1054
+
1055
+ scored = sorted(
1056
+ ((sentence_texts[i], scores[i], sentence_offsets[i], i)
1057
+ for i in range(len(sentence_texts))),
1058
+ key=lambda x: x[1],
1059
+ reverse=True,
1060
+ )
1061
+
1062
+ selected = []
1063
+ total_chars = 0
1064
+ chars_before_last = 0
1065
+ for sent_text, score, offset, order in scored:
1066
+ if total_chars >= max_chars:
936
1067
  break
937
- summary = sorted(summary, key=lambda x: x[2])
938
- summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
939
- if summary_chars > max_chars:
940
- summary[-1] = (
941
- summary[-1][0][: max_chars - summary_chars_penultimate],
942
- summary[-1][1],
943
- )
1068
+ selected.append((sent_text, score, offset, order))
1069
+ chars_before_last = total_chars
1070
+ total_chars += len(sent_text) + 1
1071
+
1072
+ selected.sort(key=lambda x: x[3])
1073
+ summary = [(s[0], s[1], s[2]) for s in selected]
1074
+
1075
+ if total_chars > max_chars and summary:
1076
+ remaining = max_chars - chars_before_last
1077
+ if remaining > 0:
1078
+ summary[-1] = (summary[-1][0][:remaining], summary[-1][1], summary[-1][2])
1079
+ else:
1080
+ summary.pop()
1081
+
944
1082
  if not with_scores:
945
- summary = " ".join([s[0] for s in summary])
946
- else:
947
- min_score = min([s[1] for s in summary]) if summary else 0
948
- max_score = max([min_score] + [s[1] for s in summary])
949
- score_range = 1 if min_score == max_score else (max_score - min_score)
950
- summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
951
- return summary
1083
+ return " ".join(s[0] for s in summary)
1084
+
1085
+ if not summary:
1086
+ return []
1087
+ min_score = min(s[1] for s in summary)
1088
+ max_score = max(s[1] for s in summary)
1089
+ score_range = (max_score - min_score) if max_score != min_score else 1.0
1090
+ return [(s[0], (s[1] - min_score) / score_range, s[2]) for s in summary]
952
1091
 
953
1092
 
954
1093
  def _preprocess_newlines_for_ner(text: str) -> str:
@@ -1008,7 +1147,7 @@ def _strip_honorifics_for_ner(text: str) -> str:
1008
1147
  return result
1009
1148
 
1010
1149
 
1011
- def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines):
1150
+ def _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines):
1012
1151
  """Preprocess a single text for NER (newlines, honorifics, compression)."""
1013
1152
  if preprocess_newlines:
1014
1153
  text = _preprocess_newlines_for_ner(text)
@@ -1016,11 +1155,11 @@ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess
1016
1155
 
1017
1156
  cr = compression_ratio
1018
1157
  if cr == "auto":
1019
- cr = max(1.0, len(text) / 15000) if fast else 1.0
1158
+ cr = max(1.0, len(text) / 15000)
1020
1159
 
1021
1160
  if cr > 1.0:
1022
1161
  sentences = get_extractive_summary(
1023
- text, language, int(len(text) / cr), fast=fast, with_scores=True
1162
+ text, language, int(len(text) / cr), with_scores=True
1024
1163
  )
1025
1164
  text = " ".join([s[0] for s in sentences])
1026
1165
 
@@ -1032,7 +1171,6 @@ def _ner_pipe_batch(
1032
1171
  language,
1033
1172
  model,
1034
1173
  engine_type="spacy",
1035
- fast=False,
1036
1174
  compression_ratio="auto",
1037
1175
  with_comentions=True,
1038
1176
  with_context=True,
@@ -1056,7 +1194,7 @@ def _ner_pipe_batch(
1056
1194
  if not isinstance(t, str):
1057
1195
  raise TypeError(f"Each text must be str, not {type(t).__name__}")
1058
1196
  processed_texts.append(
1059
- _preprocess_text_for_ner(t, language, fast, compression_ratio, preprocess_newlines)
1197
+ _preprocess_text_for_ner(t, language, compression_ratio, preprocess_newlines)
1060
1198
  )
1061
1199
 
1062
1200
  if _analyzer is None:
@@ -1084,7 +1222,6 @@ def ner_pipe(
1084
1222
  language,
1085
1223
  model,
1086
1224
  engine_type="spacy",
1087
- fast=False,
1088
1225
  compression_ratio="auto",
1089
1226
  with_scores=False,
1090
1227
  with_comentions=True,
@@ -1109,8 +1246,8 @@ def ner_pipe(
1109
1246
  language: Language code (e.g., 'en', 'de', 'fr')
1110
1247
  model: Model name or instance for spacy/flair engine
1111
1248
  engine_type: 'regex', 'flair', 'spacy' or 'custom'
1112
- fast: Use fast summarization for long texts
1113
- compression_ratio: Compression ratio for long texts ('auto' or float)
1249
+ compression_ratio: Compression ratio for long texts ('auto' or float).
1250
+ 'auto' compresses texts over ~15k chars proportionally.
1114
1251
  with_scores: Include confidence scores (not implemented)
1115
1252
  with_comentions: Include co-mentioned entities
1116
1253
  with_context: Include surrounding context
@@ -1129,7 +1266,7 @@ def ner_pipe(
1129
1266
 
1130
1267
  if isinstance(text, list):
1131
1268
  return _ner_pipe_batch(
1132
- text, language, model, engine_type, fast, compression_ratio,
1269
+ text, language, model, engine_type, compression_ratio,
1133
1270
  with_comentions=with_comentions, with_context=with_context,
1134
1271
  entities=entities, score_threshold=score_threshold,
1135
1272
  batch_size=batch_size, n_process=n_process,
@@ -1146,7 +1283,7 @@ def ner_pipe(
1146
1283
  model=model,
1147
1284
  )
1148
1285
 
1149
- text = _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines)
1286
+ text = _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines)
1150
1287
 
1151
1288
  ner = compute_ner_presidio(
1152
1289
  text,
@@ -1168,7 +1305,7 @@ def get_ner_handler(
1168
1305
  language,
1169
1306
  model,
1170
1307
  engine_type="spacy",
1171
- fast=False,
1308
+ compression_ratio="auto",
1172
1309
  entities=None,
1173
1310
  score_threshold=0.5,
1174
1311
  batch_size=32,
@@ -1186,12 +1323,11 @@ def get_ner_handler(
1186
1323
  model=model,
1187
1324
  )
1188
1325
 
1189
- return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
1326
+ return lambda text, compression_ratio=compression_ratio, with_scores=False, with_comentions=True, with_context=True: ner_pipe(
1190
1327
  text,
1191
1328
  language,
1192
1329
  model,
1193
1330
  engine_type,
1194
- fast,
1195
1331
  compression_ratio,
1196
1332
  with_scores,
1197
1333
  with_comentions,
@@ -1203,8 +1339,3 @@ def get_ner_handler(
1203
1339
  preprocess_newlines,
1204
1340
  _analyzer=analyzer
1205
1341
  )
1206
-
1207
-
1208
- @st.cache_resource
1209
- def get_cached_ner_handler(language, model):
1210
- return get_ner_handler(language, model)