PyRuSH 1.0.10.dev0__tar.gz → 1.0.12.dev0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PKG-INFO +15 -1
  2. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/PyRuSHSentencizer.py +55 -16
  3. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/RuSH.py +10 -46
  4. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/StaticSentencizerFun.cpp +5687 -1536
  5. pyrush-1.0.12.dev0/PyRuSH/StaticSentencizerFun.pyx +215 -0
  6. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/__init__.py +1 -1
  7. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/PKG-INFO +15 -1
  8. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/SOURCES.txt +6 -1
  9. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/requires.txt +1 -0
  10. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/README.rst +13 -0
  11. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/conf/rush_rules.tsv +1 -1
  12. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/pyproject.toml +0 -1
  13. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/requirements.txt +2 -1
  14. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.cfg +0 -1
  15. pyrush-1.0.12.dev0/tests/test_PyRuSHSentencizer_param.py +70 -0
  16. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_PyRushSentencizer.py +12 -10
  17. pyrush-1.0.12.dev0/tests/test_PyRushSentencizer2.py +45 -0
  18. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush.py +38 -6
  19. pyrush-1.0.12.dev0/tests/test_cpredict_split_gaps.py +198 -0
  20. pyrush-1.0.12.dev0/tests/test_debug.py +35 -0
  21. pyrush-1.0.12.dev0/tests/test_merge_gaps_max_length.py +119 -0
  22. pyrush-1.0.10.dev0/PyRuSH/StaticSentencizerFun.pyx +0 -125
  23. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/LICENSE +0 -0
  24. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/MANIFEST.in +0 -0
  25. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/dependency_links.txt +0 -0
  26. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/not-zip-safe +0 -0
  27. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/top_level.txt +0 -0
  28. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.py +0 -0
  29. {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush_w_Logger.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PyRuSH
3
- Version: 1.0.10.dev0
3
+ Version: 1.0.12.dev0
4
4
  Summary: PyRuSH is the Python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which was originally developed in Java. RuSH is an efficient, reliable, and easily adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical notes. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
5
5
  Home-page: https://github.com/jianlins/PyRuSH
6
6
  Author: Jianlin
@@ -18,6 +18,7 @@ Requires-Dist: spacy<3.8; python_version < "3.12"
18
18
  Requires-Dist: spacy>=3.8; python_version >= "3.12"
19
19
  Requires-Dist: PyFastNER>=1.0.8
20
20
  Requires-Dist: quicksectx>=0.3.5
21
+ Requires-Dist: loguru
21
22
  Dynamic: author
22
23
  Dynamic: home-page
23
24
  Dynamic: license-file
@@ -76,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
76
77
  A Colab Notebook Demo
77
78
  ---------------------------
78
79
  Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
80
+
81
+ Revision History
82
+ ----------------
83
+
84
+ **1.0.11 (2025-09-02)**
85
+
86
+ - Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
87
+ - Edge case handling: Trailing whitespace (caused by spaCy's sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
88
+
89
+ **1.0.9 (2024-10-27)**
90
+
91
+ - Initial release with spaCy 3.x compatibility and core RuSH logic.
92
+ - Added Spacy-compatible PyRuSHSentencizer component.
@@ -25,16 +25,22 @@ from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_
25
25
  @Language.factory("medspacy_pyrush")
26
26
  class PyRuSHSentencizer(Sentencizer):
27
27
  def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50,
28
- auto_fix_gaps: bool = True, merge_gaps: bool = False) -> Sentencizer:
28
+ auto_fix_gaps: bool = True, merge_gaps: bool = False, max_sentence_length: int = None) -> Sentencizer:
29
29
  """
30
+ Initialize the PyRuSH sentencizer component.
30
31
 
31
- @param rules_path: The string of the rule file path or rules themselves. By default, it will look for
32
- rush_rules.tsv in the site_packages/conf folder.
33
- @param max_repeat: Total number of replicates that allows to be handled by "+" wildcard.
34
- @param auto_fix_gaps: If gaps are caused by malcrafted rules, try to fix them.
35
- However, this has no control of sentence end,
36
- @param merge_gaps: When True, gaps between sentences are merged into the preceding sentence.
37
- When False, gaps are split into separate sentences.
32
+ Args:
33
+ nlp (Language): The spaCy language pipeline.
34
+ name (str): Name of the component. Default is "medspacy_pyrush".
35
+ rules_path (str): Path to the rule file or rules themselves. If empty, defaults to 'conf/rush_rules.tsv'.
36
+ max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules.
37
+ auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules.
38
+ merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False, splits gaps (might be multiple whitespaces or new line characters) into separate sentences.
39
+ max_sentence_length (int or None): Maximum allowed sentence length in characters. If set, sentences longer than this will be split.
40
+
41
+ Notes:
42
+ - Setting merge_gaps controls whether gaps are merged or split.
43
+ - max_sentence_length applies to both merge and split modes.
38
44
  """
39
45
  self.nlp = nlp
40
46
  self.name = name
@@ -45,32 +51,65 @@ class PyRuSHSentencizer(Sentencizer):
45
51
  self.rules_path = rules_path
46
52
  self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
47
53
  self.merge_gaps = merge_gaps
54
+ self.max_sentence_length = max_sentence_length
48
55
 
49
56
  @classmethod
50
57
  def from_nlp(cls, nlp, **cfg):
58
+ """
59
+ Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration.
60
+
61
+ Args:
62
+ nlp (Language): The spaCy language pipeline.
63
+ **cfg: Additional configuration parameters for initialization.
64
+
65
+ Returns:
66
+ PyRuSHSentencizer: An initialized sentencizer instance.
67
+ """
51
68
  return cls(**cfg)
52
69
 
53
70
  def __call__(self, doc):
71
+ """
72
+ Apply sentence boundary detection to a spaCy Doc and set sentence start annotations.
73
+
74
+ Args:
75
+ doc (Doc): The spaCy Doc to process.
76
+
77
+ Returns:
78
+ Doc: The processed Doc with sentence boundaries set.
79
+ """
54
80
  tags = self.predict([doc])
55
81
  cset_annotations([doc], tags)
56
82
  return doc
57
83
 
58
84
  def predict(self, docs):
59
- """Apply the pipeline's model to a batch of docs, without
60
- modifying them.
85
+ """
86
+ Predict sentence boundaries for a batch of spaCy Docs.
87
+
88
+ Args:
89
+ docs (list of Doc): List of spaCy Docs to process.
90
+
91
+ Returns:
92
+ list of list of bool: Sentence start guesses for each Doc.
93
+
94
+ Notes:
95
+ - Does not modify the Docs; only returns sentence start predictions.
61
96
  """
62
97
  if self.merge_gaps:
63
- from .StaticSentencizerFun import cpredict_ww
64
- guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans)
98
+ guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
65
99
  else:
66
- guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans)
100
+ guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
67
101
  return guesses
68
102
 
69
103
  def set_annotations(self, docs, batch_tag_ids, tensors=None):
70
104
  """
71
- This function overwrite spacy's Sentencizer.
105
+ Set sentence boundary annotations on spaCy Docs.
106
+
107
+ Args:
108
+ docs (list of Doc): List of spaCy Docs to annotate.
109
+ batch_tag_ids (list of list of bool): Sentence start tags for each Doc.
110
+ tensors: Placeholder for future extensions (optional).
72
111
 
73
- @param batch_tag_ids: a list of doc's tags (a list of boolean values)
74
- @param tensors: a place holder for future extensions
112
+ Notes:
113
+ - This method overwrites spaCy's Sentencizer annotations.
75
114
  """
76
115
  cset_annotations(docs, batch_tag_ids, tensors)
@@ -27,8 +27,7 @@
27
27
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28
28
  # See the License for the specific language governing permissions and
29
29
  # limitations under the License.
30
- import logging
31
- import logging.config
30
+ from loguru import logger
32
31
  import os.path
33
32
  from typing import Union, List
34
33
 
@@ -40,40 +39,7 @@ END = 'stend'
40
39
 
41
40
 
42
41
  def initLogger():
43
- config_files = ['../../../conf/logging.ini', '../../conf/logging.ini', '../conf/logging.ini', 'conf/logging.ini',
44
- 'logging.ini']
45
- config_file = None
46
- for f in config_files:
47
- if os.path.isfile(f):
48
- config_file = f
49
- break
50
- if config_file is None:
51
- config_file = config_files[-1]
52
- with open(config_file, 'w') as f:
53
- f.write('''[loggers]
54
- keys=root
55
-
56
- [handlers]
57
- keys=consoleHandler
58
-
59
- [formatters]
60
- keys=simpleFormatter
61
-
62
- [logger_root]
63
- level=WARNING
64
- handlers=consoleHandler
65
-
66
- [handler_consoleHandler]
67
- class=StreamHandler
68
- level=WARNING
69
- formatter=simpleFormatter
70
- args=(sys.stdout,)
71
-
72
- [formatter_simpleFormatter]
73
- format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
74
- datefmt=
75
- ''')
76
- logging.config.fileConfig(config_file)
42
+ pass # Removed: logging config logic for Loguru migration
77
43
 
78
44
 
79
45
  class RuSH:
@@ -84,9 +50,7 @@ class RuSH:
84
50
  self.fastner = FastCNER(rules, max_repeat)
85
51
  self.fastner.span_compare_method = 'scorewidth'
86
52
  if enable_logger:
87
- initLogger()
88
- self.logger = logging.getLogger(__name__)
89
- print(self.logger.level)
53
+ self.logger = logger
90
54
  else:
91
55
  self.logger = None
92
56
  self.auto_fix_gaps = auto_fix_gaps
@@ -109,13 +73,13 @@ class RuSH:
109
73
  self.fastner.process(text, 0, result)
110
74
 
111
75
  # log important message for debugging use
112
- if self.logger is not None and self.logger.isEnabledFor(logging.DEBUG):
76
+ if self.logger is not None:
113
77
  text = text.replace('\n', ' ')
114
78
  for concept_type, spans in result.items():
115
- self.logger.debug(concept_type)
79
+ self.logger.opt(lazy=True).debug(concept_type)
116
80
  for span in spans:
117
81
  rule = self.fastner.rule_store[span.rule_id]
118
- self.logger.debug(
82
+ self.logger.opt(lazy=True).debug(
119
83
  '\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(span.begin, span.end,
120
84
  span.score,
121
85
  text[:span.begin],
@@ -185,15 +149,15 @@ class RuSH:
185
149
  if trimed_gap is not None and trimed_gap.width > self.min_sent_chars:
186
150
  output.append(trimed_gap)
187
151
 
188
- if self.logger is not None and self.logger.isEnabledFor(logging.DEBUG):
152
+ if self.logger is not None:
189
153
  for sentence in output:
190
- self.logger.debug(
154
+ self.logger.opt(lazy=True).debug(
191
155
  'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end, text[sentence.begin:sentence.end]))
192
156
 
193
157
  return output
194
158
 
195
159
  @staticmethod
196
- def fix_gap(sentences: [], text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
160
+ def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
197
161
  trimed_gap = RuSH.trim_gap(text, previous_end, this_begin)
198
162
  if trimed_gap is None:
199
163
  return
@@ -203,7 +167,7 @@ class RuSH:
203
167
  sentences[-1].end = trimed_gap.end
204
168
 
205
169
  @staticmethod
206
- def trim_gap(text: str, previous_end: int, this_begin: int) -> Span:
170
+ def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None':
207
171
  begin = -1
208
172
  alnum_begin = -1
209
173
  end = 0