streamlit-octostar-utils 0.5.0.dev6__tar.gz → 0.5.0.dev8__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/PKG-INFO +1 -1
  2. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/pyproject.toml +1 -1
  3. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/nifi.py +47 -19
  4. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/ner.py +214 -83
  5. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/LICENSE +0 -0
  6. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/README.md +0 -0
  7. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/__init__.py +0 -0
  8. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/__init__.py +0 -0
  9. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/celery.py +0 -0
  10. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/contents.py +0 -0
  11. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/fastapi.py +0 -0
  12. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parallelism.py +0 -0
  13. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/__init__.py +0 -0
  14. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/combine_fields.py +0 -0
  15. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/entities_parser.py +0 -0
  16. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/generics.py +0 -0
  17. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/info.py +0 -0
  18. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/linkchart_functions.py +0 -0
  19. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/matches.py +0 -0
  20. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/parameters.py +0 -0
  21. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/rules.py +0 -0
  22. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/api_crafter/parser/signals.py +0 -0
  23. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/__init__.py +0 -0
  24. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/dict.py +0 -0
  25. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/filetypes.py +0 -0
  26. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/__init__.py +0 -0
  27. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/threading/key_queue.py +0 -0
  28. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/core/timestamp.py +0 -0
  29. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/__init__.py +0 -0
  30. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/custom_recognizers.py +0 -0
  31. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/nlp/language.py +0 -0
  32. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/__init__.py +0 -0
  33. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/client.py +0 -0
  34. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/context.py +0 -0
  35. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/octostar/permissions.py +0 -0
  36. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/__init__.py +0 -0
  37. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/inheritance.py +0 -0
  38. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/relationships.py +0 -0
  39. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/ontology/validation.py +0 -0
  40. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/__init__.py +0 -0
  41. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/style/common.py +0 -0
  42. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/__init__.py +0 -0
  43. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/async_task_manager.py +0 -0
  44. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_callback_manager.py +0 -0
  45. {streamlit_octostar_utils-0.5.0.dev6 → streamlit_octostar_utils-0.5.0.dev8}/streamlit_octostar_utils/threading/session_state_hot_swapper.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: streamlit-octostar-utils
3
- Version: 0.5.0.dev6
3
+ Version: 0.5.0.dev8
4
4
  Summary:
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -5,7 +5,7 @@ include = '\.pyi?$'
5
5
 
6
6
  [tool.poetry]
7
7
  name = "streamlit-octostar-utils"
8
- version = "0.5.0-dev.6"
8
+ version = "0.5.0-dev.8"
9
9
  description = ""
10
10
  license = "MIT"
11
11
  authors = ["Octostar"]
@@ -45,6 +45,7 @@ TAG_RELATIONSHIP = RelationshipName(name="has_tag", type="mtm")
45
45
  FRAGMENT_RELATIONSHIP = RelationshipName(name="is_fragment_of", type="otm")
46
46
  PREVIOUS_FRAGMENT_RELATIONSHIP = RelationshipName(name="has_previous_fragment", type="otm")
47
47
  NEXT_FRAGMENT_RELATIONSHIP = RelationshipName(name="has_next_fragment", type="otm")
48
+ SOURCE_FRAGMENT_ENTITY_RELATIONSHIP = RelationshipName(name="is_child_fragment_of", type="otm")
48
49
  OS_RESERVED_FIELDS = [
49
50
  "os_entity_uid", "entity_id", "entity_type", "os_concept",
50
51
  "entity_label", "os_created_at", "os_created_by",
@@ -212,6 +213,7 @@ class NifiFragmenter(object):
212
213
  raise ValueError("Cannot have more than 100k entities for fragmentation")
213
214
  identifier = str(uuid.uuid4())
214
215
  root_uid = fragments[0].record["os_entity_uid"]
216
+ root_type = fragments[0].record["os_concept"]
215
217
  for i, entity in enumerate(fragments):
216
218
  travel_dict(entity.request["nifi_attributes"], fragmenter_keylist.split("."), "w")(
217
219
  {"identifier": identifier, "count": count, "index": i}
@@ -225,7 +227,7 @@ class NifiFragmenter(object):
225
227
  "fragments_stack"
226
228
  ]
227
229
  travel_dict(entity.request["config"]["fragment"], fragmenter_keylist.split("."), "w")(
228
- {"identifier": identifier, "count": count, "index": i, "root_uid": root_uid}
230
+ {"identifier": identifier, "count": count, "index": i, "root_uid": root_uid, "root_type": root_type}
229
231
  )
230
232
 
231
233
  def push_defragment_strategy(fragment, defragmenter_config):
@@ -424,30 +426,34 @@ class NifiFragmenter(object):
424
426
  )
425
427
 
426
428
  @staticmethod
427
- def resolve_source_entity_uid(entity, fragment_root_source=None) -> str:
428
- """Resolve the source_entity_uid to use for child fragments.
429
+ def resolve_source_entity(entity, fragment_root_source=None) -> tuple:
430
+ """Resolve the source entity (recursive originator) for child fragments.
429
431
 
430
432
  When fragment_root_source is set (a fragment name or stack index),
431
- the UID is looked up via entity.get_fragment_root_uid() -- this is
432
- necessary when the current entity is a clone that may not be persisted.
433
- Otherwise falls back to the entity's own UID.
433
+ the root is looked up via entity.get_fragment_root(). When it is
434
+ None and the entity is already fragmented, the oldest fragmentation
435
+ level's root is used (the original non-fragment ancestor). Otherwise
436
+ falls back to the entity itself.
434
437
 
435
438
  Args:
436
439
  entity: A NifiEntity or NifiEntityProxy.
437
- fragment_root_source: None to use the entity's own UID, an int
440
+ fragment_root_source: None for automatic resolution, an int
438
441
  to index into the fragments stack, or a string fragmenter
439
442
  keylist name.
440
443
 
441
444
  Returns:
442
- The resolved source entity UID string.
445
+ (source_entity_uid, source_entity_type) tuple.
443
446
  """
444
- if fragment_root_source is None:
445
- return entity.record["os_entity_uid"]
446
- try:
447
- idx = int(fragment_root_source)
448
- return entity.get_fragment_root_uid(idx)
449
- except (ValueError, TypeError):
450
- return entity.get_fragment_root_uid(fragment_root_source)
447
+ if fragment_root_source is not None:
448
+ try:
449
+ idx = int(fragment_root_source)
450
+ return entity.get_fragment_root(idx)
451
+ except (ValueError, TypeError):
452
+ return entity.get_fragment_root(fragment_root_source)
453
+ stack = entity.request.get("config", {}).get("fragment", {}).get("fragments_stack", [])
454
+ if stack:
455
+ return entity.get_fragment_root(-1)
456
+ return (entity.record["os_entity_uid"], entity.record["os_concept"])
451
457
 
452
458
 
453
459
  class NifiEntityBatch(object):
@@ -1089,7 +1095,17 @@ class NifiEntity(object):
1089
1095
  )
1090
1096
  return NifiFragmenter.get_fragment_info(self, key).get("index", 0) == 0
1091
1097
 
1092
- def get_fragment_root_uid(self, fragment_name_or_idx) -> str:
1098
+ def get_fragment_root(self, fragment_name_or_idx) -> tuple:
1099
+ """Return (root_uid, root_type) for a given fragmentation level.
1100
+
1101
+ Args:
1102
+ fragment_name_or_idx: An int to index into fragments_stack
1103
+ (e.g. -1 for the oldest fragmentation), or a string
1104
+ fragmenter keylist name.
1105
+
1106
+ Returns:
1107
+ (root_uid, root_type) tuple for the root entity at that level.
1108
+ """
1093
1109
  fragment_config = self.request.get("config", {}).get("fragment", {})
1094
1110
  fragments_stack = fragment_config.get("fragments_stack", [])
1095
1111
  if isinstance(fragment_name_or_idx, int):
@@ -1114,7 +1130,10 @@ class NifiEntity(object):
1114
1130
  raise KeyError(
1115
1131
  f"No root_uid found in fragment config for '{fragment_key}'"
1116
1132
  )
1117
- return fragment_data["root_uid"]
1133
+ return (
1134
+ fragment_data["root_uid"],
1135
+ fragment_data.get("root_type", "os_fragment"),
1136
+ )
1118
1137
 
1119
1138
  def to_json(self):
1120
1139
  if self.drop_on_output:
@@ -1408,6 +1427,7 @@ class NifiEntity(object):
1408
1427
  self,
1409
1428
  os_workspace,
1410
1429
  source_entity_uid,
1430
+ source_entity_type,
1411
1431
  filename,
1412
1432
  filetype,
1413
1433
  file: Union[Contents, bytes],
@@ -1422,9 +1442,12 @@ class NifiEntity(object):
1422
1442
  next_fragment_uid=None,
1423
1443
  next_fragment_relationship_uid=None,
1424
1444
  next_fragment_relationship=NEXT_FRAGMENT_RELATIONSHIP,
1445
+ source_entity_relationship_uid=None,
1446
+ source_entity_relationship=SOURCE_FRAGMENT_ENTITY_RELATIONSHIP,
1425
1447
  ):
1426
1448
  fields = {
1427
1449
  **fields,
1450
+ "os_parent_uid": self.record["os_entity_uid"],
1428
1451
  "source_entity_uid": source_entity_uid,
1429
1452
  "previous_entity_uid": previous_fragment_uid,
1430
1453
  "next_entity_uid": next_fragment_uid,
@@ -1440,9 +1463,14 @@ class NifiEntity(object):
1440
1463
  os_relationship_uid,
1441
1464
  os_entity_type,
1442
1465
  )
1443
- prev_rel, next_rel = None, None
1444
1466
  fragment_uid = child_entity.record["os_entity_uid"]
1445
1467
  fragment_type = child_entity.record["os_concept"]
1468
+ source_rel = self._add_relationship(
1469
+ os_workspace, fragment_uid, fragment_type,
1470
+ source_entity_uid, source_entity_type,
1471
+ source_entity_relationship, {}, source_entity_relationship_uid,
1472
+ )
1473
+ prev_rel, next_rel = None, None
1446
1474
  if previous_fragment_uid:
1447
1475
  prev_rel = self._add_relationship(
1448
1476
  os_workspace, fragment_uid, fragment_type,
@@ -1455,7 +1483,7 @@ class NifiEntity(object):
1455
1483
  next_fragment_uid, os_entity_type,
1456
1484
  next_fragment_relationship, {}, next_fragment_relationship_uid,
1457
1485
  )
1458
- return child_entity, child_rel, prev_rel, next_rel
1486
+ return child_entity, child_rel, source_rel, prev_rel, next_rel
1459
1487
 
1460
1488
  def add_tag(self, os_workspace, name, group, order, color, fields={}):
1461
1489
  return self.add_child_entity(
@@ -13,18 +13,14 @@ from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, AnalysisExplan
13
13
  EntityRecognizer, RecognizerResult
14
14
  from presidio_analyzer.nlp_engine import NlpArtifacts, NlpEngineProvider
15
15
  from presidio_analyzer.predefined_recognizers import SpacyRecognizer, PhoneRecognizer
16
- import streamlit as st
17
16
  import nltk
18
17
  from flair.data import Sentence
19
18
  from flair.models import SequenceTagger
20
19
 
21
20
  from .custom_recognizers import PhonePatternRecognizer, ModernUrlRecognizer
22
21
 
23
- from sumy.parsers.plaintext import PlaintextParser
24
22
  from sumy.nlp.tokenizers import Tokenizer
25
23
  from sumy.nlp.stemmers import Stemmer
26
- from sumy.summarizers.lsa import LsaSummarizer
27
- from sumy.summarizers.luhn import LuhnSummarizer
28
24
  from sumy.utils import get_stop_words
29
25
 
30
26
  from .language import to_name, SPACY_MODELS
@@ -450,36 +446,151 @@ def expand_entities_for_analyzer(entities_list):
450
446
  return list(expanded)
451
447
 
452
448
 
453
- def _sumy__get_best_sentences(sentences, rating, *args, **kwargs):
454
- from operator import attrgetter
455
- from sumy.summarizers._summarizer import SentenceInfo
456
-
457
- rate = rating
458
- if isinstance(rating, dict):
459
- assert not args and not kwargs
460
- rate = lambda s: rating[s]
461
- infos = (SentenceInfo(s, o, rate(s, *args, **kwargs)) for o, s in enumerate(sentences))
462
- infos = sorted(infos, key=attrgetter("rating"), reverse=True)
463
- return tuple((i.sentence, i.rating, i.order) for i in infos)
464
-
465
-
466
- def _sumy__lsa_call(summarizer, document):
467
- summarizer._ensure_dependecies_installed()
468
- dictionary = summarizer._create_dictionary(document)
469
- if not dictionary:
470
- return ()
471
- matrix = summarizer._create_matrix(document, dictionary)
472
- matrix = summarizer._compute_term_frequency(matrix)
473
- from numpy.linalg import svd as singular_value_decomposition
449
+ def _map_paragraph_sentences(lines, line_offsets, tokenizer):
450
+ """Sentence-tokenize a joined paragraph and map each sentence back to its
451
+ original character offset using a segment map built from the source lines."""
452
+ joined = ' '.join(lines)
453
+ if not joined:
454
+ return []
455
+ sents = tokenizer.to_sentences(joined)
456
+
457
+ segments = []
458
+ j = 0
459
+ for line_text, orig_start in zip(lines, line_offsets):
460
+ segments.append((j, orig_start, len(line_text)))
461
+ j += len(line_text) + 1
462
+
463
+ def _to_original(pos_in_joined):
464
+ for j_start, o_start, length in segments:
465
+ if pos_in_joined < j_start + length:
466
+ return o_start + (pos_in_joined - j_start)
467
+ last = segments[-1]
468
+ return last[1] + last[2]
469
+
470
+ results = []
471
+ search_pos = 0
472
+ for sent in sents:
473
+ idx = joined.find(sent, search_pos)
474
+ if idx == -1:
475
+ idx = search_pos
476
+ results.append((sent, _to_original(idx)))
477
+ search_pos = idx + len(sent)
478
+ return results
479
+
480
+
481
+ def _tokenize_sentences(text, tokenizer):
482
+ """Split text into (sentence_text, original_char_offset) pairs,
483
+ respecting paragraph boundaries."""
484
+ results = []
485
+ current_lines = []
486
+ line_offsets = []
487
+ pos = 0
488
+ for line in text.splitlines(True):
489
+ stripped = line.strip()
490
+ if not stripped:
491
+ if current_lines:
492
+ results.extend(
493
+ _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
494
+ )
495
+ current_lines = []
496
+ line_offsets = []
497
+ else:
498
+ leading = len(line) - len(line.lstrip())
499
+ line_offsets.append(pos + leading)
500
+ current_lines.append(stripped)
501
+ pos += len(line)
502
+ if current_lines:
503
+ results.extend(
504
+ _map_paragraph_sentences(current_lines, line_offsets, tokenizer)
505
+ )
506
+ return results
474
507
 
475
- u, sigma, v = singular_value_decomposition(matrix, full_matrices=False)
476
- ranks = iter(summarizer._compute_ranks(sigma, v))
477
- return _sumy__get_best_sentences(document.sentences, lambda s: next(ranks))
478
508
 
509
+ def _build_sentence_vocab(sentences, tokenizer, stemmer, stop_words):
510
+ """Tokenize words, stem, filter stop words, and build a vocabulary index.
479
511
 
480
- def _sumy__luhn_call(summarizer, document):
481
- words = summarizer._get_significant_words(document.words)
482
- return _sumy__get_best_sentences(document.sentences, summarizer.rate_sentence, words)
512
+ Returns:
513
+ sentence_word_indices: list of lists of vocab indices per sentence
514
+ n_vocab: total vocabulary size
515
+ doc_freq: dict mapping vocab index to number of sentences containing it
516
+ """
517
+ vocab = {}
518
+ doc_freq = {}
519
+ n_vocab = 0
520
+ sentence_word_indices = []
521
+
522
+ for sent_text in sentences:
523
+ words = tokenizer.to_words(sent_text)
524
+ indices = []
525
+ seen = set()
526
+ for w in words:
527
+ normalized = w.lower()
528
+ if normalized in stop_words:
529
+ continue
530
+ stemmed = stemmer(normalized)
531
+ if stemmed not in vocab:
532
+ vocab[stemmed] = n_vocab
533
+ n_vocab += 1
534
+ idx = vocab[stemmed]
535
+ indices.append(idx)
536
+ if idx not in seen:
537
+ seen.add(idx)
538
+ doc_freq[idx] = doc_freq.get(idx, 0) + 1
539
+ sentence_word_indices.append(indices)
540
+
541
+ return sentence_word_indices, n_vocab, doc_freq
542
+
543
+
544
+ def _score_tfidf_centroid(sentence_word_indices, n_vocab, doc_freq):
545
+ """Score sentences by TF-IDF cosine similarity to the document centroid.
546
+
547
+ Each sentence gets a TF-IDF vector (sparse dict). The centroid is the mean
548
+ of all sentence vectors. Sentence score = cosine(sentence_vec, centroid).
549
+ Runs in O(S * avg_words) time and memory with no dense matrix allocation.
550
+ """
551
+ n_sents = len(sentence_word_indices)
552
+ if n_sents == 0 or n_vocab == 0:
553
+ return [0.0] * n_sents
554
+
555
+ idf = {idx: math.log(n_sents / count) + 1.0 for idx, count in doc_freq.items()}
556
+
557
+ centroid = {}
558
+ sentence_tfidf = []
559
+ for indices in sentence_word_indices:
560
+ if not indices:
561
+ sentence_tfidf.append(None)
562
+ continue
563
+ tf = {}
564
+ for idx in indices:
565
+ tf[idx] = tf.get(idx, 0) + 1
566
+ max_tf = max(tf.values())
567
+ tfidf = {}
568
+ for idx, count in tf.items():
569
+ val = (0.5 + 0.5 * count / max_tf) * idf.get(idx, 0.0)
570
+ tfidf[idx] = val
571
+ centroid[idx] = centroid.get(idx, 0.0) + val
572
+ sentence_tfidf.append(tfidf)
573
+
574
+ inv_n = 1.0 / n_sents
575
+ centroid_norm_sq = 0.0
576
+ for idx in centroid:
577
+ centroid[idx] *= inv_n
578
+ centroid_norm_sq += centroid[idx] ** 2
579
+ centroid_norm = math.sqrt(centroid_norm_sq)
580
+
581
+ if centroid_norm == 0:
582
+ return [0.0] * n_sents
583
+
584
+ scores = []
585
+ for tfidf in sentence_tfidf:
586
+ if tfidf is None:
587
+ scores.append(0.0)
588
+ continue
589
+ dot = sum(val * centroid.get(idx, 0.0) for idx, val in tfidf.items())
590
+ sent_norm = math.sqrt(sum(v * v for v in tfidf.values()))
591
+ scores.append(dot / (sent_norm * centroid_norm) if sent_norm > 0 else 0.0)
592
+
593
+ return scores
483
594
 
484
595
 
485
596
  def get_nltk_tokenizer(language: str) -> Tokenizer:
@@ -911,44 +1022,72 @@ def _compute_ner_batch(
911
1022
  return all_ner_objects
912
1023
 
913
1024
 
914
- def get_extractive_summary(text, language, max_chars, fast=False, with_scores=False):
1025
+ def get_extractive_summary(text, language, max_chars, with_scores=False):
1026
+ """Extract a summary using TF-IDF centroid sentence scoring.
1027
+
1028
+ Scores all sentences by cosine similarity to the TF-IDF document centroid,
1029
+ then greedily selects the highest-scoring sentences up to max_chars.
1030
+
1031
+ Args:
1032
+ text: Input text to summarize.
1033
+ language: Language code (e.g. 'en', 'de', 'fr').
1034
+ max_chars: Maximum character budget for the summary.
1035
+ with_scores: If True, return list of (sentence_text, normalized_score,
1036
+ original_char_offset) 3-tuples in document order.
1037
+ If False, return a single joined summary string.
1038
+ """
915
1039
  tokenizer = get_nltk_tokenizer(language)
916
1040
  stemmer = Stemmer(language)
917
- parser = PlaintextParser.from_string(text, tokenizer)
918
- if fast:
919
- summarizer = LuhnSummarizer(stemmer)
920
- summarizer.stop_words = get_stop_words(language)
921
- scored_sentences = iter(_sumy__luhn_call(summarizer, parser.document))
922
- else:
923
- summarizer = LsaSummarizer(stemmer)
924
- summarizer.stop_words = get_stop_words(language)
925
- scored_sentences = iter(_sumy__lsa_call(summarizer, parser.document))
926
- summary = []
927
- summary_chars = 0
928
- summary_chars_penultimate = 0
929
- while summary_chars < max_chars:
930
- try:
931
- next_sentence = next(scored_sentences)
932
- summary.append(next_sentence)
933
- summary_chars_penultimate = summary_chars
934
- summary_chars += len(" " + next_sentence[0]._text)
935
- except StopIteration:
1041
+ stop_words = frozenset(w.lower() for w in get_stop_words(language))
1042
+
1043
+ sentence_pairs = _tokenize_sentences(text, tokenizer)
1044
+ if not sentence_pairs:
1045
+ return [] if with_scores else ""
1046
+
1047
+ sentence_texts = [s for s, _ in sentence_pairs]
1048
+ sentence_offsets = [off for _, off in sentence_pairs]
1049
+
1050
+ word_indices, n_vocab, doc_freq = _build_sentence_vocab(
1051
+ sentence_texts, tokenizer, stemmer, stop_words
1052
+ )
1053
+ scores = _score_tfidf_centroid(word_indices, n_vocab, doc_freq)
1054
+
1055
+ scored = sorted(
1056
+ ((sentence_texts[i], scores[i], sentence_offsets[i], i)
1057
+ for i in range(len(sentence_texts))),
1058
+ key=lambda x: x[1],
1059
+ reverse=True,
1060
+ )
1061
+
1062
+ selected = []
1063
+ total_chars = 0
1064
+ chars_before_last = 0
1065
+ for sent_text, score, offset, order in scored:
1066
+ if total_chars >= max_chars:
936
1067
  break
937
- summary = sorted(summary, key=lambda x: x[2])
938
- summary = [(sentence[0]._text, sentence[1]) for sentence in summary]
939
- if summary_chars > max_chars:
940
- summary[-1] = (
941
- summary[-1][0][: max_chars - summary_chars_penultimate],
942
- summary[-1][1],
943
- )
1068
+ selected.append((sent_text, score, offset, order))
1069
+ chars_before_last = total_chars
1070
+ total_chars += len(sent_text) + 1
1071
+
1072
+ selected.sort(key=lambda x: x[3])
1073
+ summary = [(s[0], s[1], s[2]) for s in selected]
1074
+
1075
+ if total_chars > max_chars and summary:
1076
+ remaining = max_chars - chars_before_last
1077
+ if remaining > 0:
1078
+ summary[-1] = (summary[-1][0][:remaining], summary[-1][1], summary[-1][2])
1079
+ else:
1080
+ summary.pop()
1081
+
944
1082
  if not with_scores:
945
- summary = " ".join([s[0] for s in summary])
946
- else:
947
- min_score = min([s[1] for s in summary]) if summary else 0
948
- max_score = max([min_score] + [s[1] for s in summary])
949
- score_range = 1 if min_score == max_score else (max_score - min_score)
950
- summary = [(s[0], (s[1] - min_score) / score_range) for s in summary]
951
- return summary
1083
+ return " ".join(s[0] for s in summary)
1084
+
1085
+ if not summary:
1086
+ return []
1087
+ min_score = min(s[1] for s in summary)
1088
+ max_score = max(s[1] for s in summary)
1089
+ score_range = (max_score - min_score) if max_score != min_score else 1.0
1090
+ return [(s[0], (s[1] - min_score) / score_range, s[2]) for s in summary]
952
1091
 
953
1092
 
954
1093
  def _preprocess_newlines_for_ner(text: str) -> str:
@@ -1008,7 +1147,7 @@ def _strip_honorifics_for_ner(text: str) -> str:
1008
1147
  return result
1009
1148
 
1010
1149
 
1011
- def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines):
1150
+ def _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines):
1012
1151
  """Preprocess a single text for NER (newlines, honorifics, compression)."""
1013
1152
  if preprocess_newlines:
1014
1153
  text = _preprocess_newlines_for_ner(text)
@@ -1016,11 +1155,11 @@ def _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess
1016
1155
 
1017
1156
  cr = compression_ratio
1018
1157
  if cr == "auto":
1019
- cr = max(1.0, len(text) / 15000) if fast else 1.0
1158
+ cr = max(1.0, len(text) / 15000)
1020
1159
 
1021
1160
  if cr > 1.0:
1022
1161
  sentences = get_extractive_summary(
1023
- text, language, int(len(text) / cr), fast=fast, with_scores=True
1162
+ text, language, int(len(text) / cr), with_scores=True
1024
1163
  )
1025
1164
  text = " ".join([s[0] for s in sentences])
1026
1165
 
@@ -1032,7 +1171,6 @@ def _ner_pipe_batch(
1032
1171
  language,
1033
1172
  model,
1034
1173
  engine_type="spacy",
1035
- fast=False,
1036
1174
  compression_ratio="auto",
1037
1175
  with_comentions=True,
1038
1176
  with_context=True,
@@ -1056,7 +1194,7 @@ def _ner_pipe_batch(
1056
1194
  if not isinstance(t, str):
1057
1195
  raise TypeError(f"Each text must be str, not {type(t).__name__}")
1058
1196
  processed_texts.append(
1059
- _preprocess_text_for_ner(t, language, fast, compression_ratio, preprocess_newlines)
1197
+ _preprocess_text_for_ner(t, language, compression_ratio, preprocess_newlines)
1060
1198
  )
1061
1199
 
1062
1200
  if _analyzer is None:
@@ -1084,7 +1222,6 @@ def ner_pipe(
1084
1222
  language,
1085
1223
  model,
1086
1224
  engine_type="spacy",
1087
- fast=False,
1088
1225
  compression_ratio="auto",
1089
1226
  with_scores=False,
1090
1227
  with_comentions=True,
@@ -1109,8 +1246,8 @@ def ner_pipe(
1109
1246
  language: Language code (e.g., 'en', 'de', 'fr')
1110
1247
  model: Model name or instance for spacy/flair engine
1111
1248
  engine_type: 'regex', 'flair', 'spacy' or 'custom'
1112
- fast: Use fast summarization for long texts
1113
- compression_ratio: Compression ratio for long texts ('auto' or float)
1249
+ compression_ratio: Compression ratio for long texts ('auto' or float).
1250
+ 'auto' compresses texts over ~15k chars proportionally.
1114
1251
  with_scores: Include confidence scores (not implemented)
1115
1252
  with_comentions: Include co-mentioned entities
1116
1253
  with_context: Include surrounding context
@@ -1129,7 +1266,7 @@ def ner_pipe(
1129
1266
 
1130
1267
  if isinstance(text, list):
1131
1268
  return _ner_pipe_batch(
1132
- text, language, model, engine_type, fast, compression_ratio,
1269
+ text, language, model, engine_type, compression_ratio,
1133
1270
  with_comentions=with_comentions, with_context=with_context,
1134
1271
  entities=entities, score_threshold=score_threshold,
1135
1272
  batch_size=batch_size, n_process=n_process,
@@ -1146,7 +1283,7 @@ def ner_pipe(
1146
1283
  model=model,
1147
1284
  )
1148
1285
 
1149
- text = _preprocess_text_for_ner(text, language, fast, compression_ratio, preprocess_newlines)
1286
+ text = _preprocess_text_for_ner(text, language, compression_ratio, preprocess_newlines)
1150
1287
 
1151
1288
  ner = compute_ner_presidio(
1152
1289
  text,
@@ -1168,7 +1305,7 @@ def get_ner_handler(
1168
1305
  language,
1169
1306
  model,
1170
1307
  engine_type="spacy",
1171
- fast=False,
1308
+ compression_ratio="auto",
1172
1309
  entities=None,
1173
1310
  score_threshold=0.5,
1174
1311
  batch_size=32,
@@ -1186,12 +1323,11 @@ def get_ner_handler(
1186
1323
  model=model,
1187
1324
  )
1188
1325
 
1189
- return lambda text, compression_ratio="auto", with_scores=False, with_comentions=True, with_context=True: ner_pipe(
1326
+ return lambda text, compression_ratio=compression_ratio, with_scores=False, with_comentions=True, with_context=True: ner_pipe(
1190
1327
  text,
1191
1328
  language,
1192
1329
  model,
1193
1330
  engine_type,
1194
- fast,
1195
1331
  compression_ratio,
1196
1332
  with_scores,
1197
1333
  with_comentions,
@@ -1203,8 +1339,3 @@ def get_ner_handler(
1203
1339
  preprocess_newlines,
1204
1340
  _analyzer=analyzer
1205
1341
  )
1206
-
1207
-
1208
- @st.cache_resource
1209
- def get_cached_ner_handler(language, model):
1210
- return get_ner_handler(language, model)