PyRuSH 1.0.10.dev0__tar.gz → 1.0.12.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PKG-INFO +15 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/PyRuSHSentencizer.py +55 -16
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/RuSH.py +10 -46
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/StaticSentencizerFun.cpp +5687 -1536
- pyrush-1.0.12.dev0/PyRuSH/StaticSentencizerFun.pyx +215 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/__init__.py +1 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/PKG-INFO +15 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/SOURCES.txt +6 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/requires.txt +1 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/README.rst +13 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/conf/rush_rules.tsv +1 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/pyproject.toml +0 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/requirements.txt +2 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.cfg +0 -1
- pyrush-1.0.12.dev0/tests/test_PyRuSHSentencizer_param.py +70 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_PyRushSentencizer.py +12 -10
- pyrush-1.0.12.dev0/tests/test_PyRushSentencizer2.py +45 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush.py +38 -6
- pyrush-1.0.12.dev0/tests/test_cpredict_split_gaps.py +198 -0
- pyrush-1.0.12.dev0/tests/test_debug.py +35 -0
- pyrush-1.0.12.dev0/tests/test_merge_gaps_max_length.py +119 -0
- pyrush-1.0.10.dev0/PyRuSH/StaticSentencizerFun.pyx +0 -125
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/LICENSE +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/MANIFEST.in +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/dependency_links.txt +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/not-zip-safe +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/top_level.txt +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.py +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush_w_Logger.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyRuSH
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.12.dev0
|
|
4
4
|
Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easily adaptable rule-based sentence segmentation solution. It is specifically designed to handle telegraphic written text in clinical notes. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
|
|
5
5
|
Home-page: https://github.com/jianlins/PyRuSH
|
|
6
6
|
Author: Jianlin
|
|
@@ -18,6 +18,7 @@ Requires-Dist: spacy<3.8; python_version < "3.12"
|
|
|
18
18
|
Requires-Dist: spacy>=3.8; python_version >= "3.12"
|
|
19
19
|
Requires-Dist: PyFastNER>=1.0.8
|
|
20
20
|
Requires-Dist: quicksectx>=0.3.5
|
|
21
|
+
Requires-Dist: loguru
|
|
21
22
|
Dynamic: author
|
|
22
23
|
Dynamic: home-page
|
|
23
24
|
Dynamic: license-file
|
|
@@ -76,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
|
|
|
76
77
|
A Colab Notebook Demo
|
|
77
78
|
---------------------------
|
|
78
79
|
Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
|
|
80
|
+
|
|
81
|
+
Revision History
|
|
82
|
+
----------------
|
|
83
|
+
|
|
84
|
+
**1.0.11 (2025-09-02)**
|
|
85
|
+
|
|
86
|
+
- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
|
|
87
|
+
- Edge case handling: Trailing whitespaces (caused by the spaCy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
|
|
88
|
+
|
|
89
|
+
**1.0.9 (2024-10-27)**
|
|
90
|
+
|
|
91
|
+
- Initial release with spaCy 3.x compatibility and core RuSH logic.
|
|
92
|
+
- Added Spacy-compatible PyRuSHSentencizer component.
|
|
@@ -25,16 +25,22 @@ from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_
|
|
|
25
25
|
@Language.factory("medspacy_pyrush")
|
|
26
26
|
class PyRuSHSentencizer(Sentencizer):
|
|
27
27
|
def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50,
|
|
28
|
-
auto_fix_gaps: bool = True, merge_gaps: bool = False) -> Sentencizer:
|
|
28
|
+
auto_fix_gaps: bool = True, merge_gaps: bool = False, max_sentence_length: int = None) -> Sentencizer:
|
|
29
29
|
"""
|
|
30
|
+
Initialize the PyRuSH sentencizer component.
|
|
30
31
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
Args:
|
|
33
|
+
nlp (Language): The spaCy language pipeline.
|
|
34
|
+
name (str): Name of the component. Default is "medspacy_pyrush".
|
|
35
|
+
rules_path (str): Path to the rule file or rules themselves. If empty, defaults to 'conf/rush_rules.tsv'.
|
|
36
|
+
max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules.
|
|
37
|
+
auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules.
|
|
38
|
+
merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False, splits gaps (might be multiple whitespaces or new line characters) into separate sentences.
|
|
39
|
+
max_sentence_length (int or None): Maximum allowed sentence length in characters. If set, sentences longer than this will be split.
|
|
40
|
+
|
|
41
|
+
Notes:
|
|
42
|
+
- Setting merge_gaps controls whether gaps are merged or split.
|
|
43
|
+
- max_sentence_length applies to both merge and split modes.
|
|
38
44
|
"""
|
|
39
45
|
self.nlp = nlp
|
|
40
46
|
self.name = name
|
|
@@ -45,32 +51,65 @@ class PyRuSHSentencizer(Sentencizer):
|
|
|
45
51
|
self.rules_path = rules_path
|
|
46
52
|
self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
|
|
47
53
|
self.merge_gaps = merge_gaps
|
|
54
|
+
self.max_sentence_length = max_sentence_length
|
|
48
55
|
|
|
49
56
|
@classmethod
|
|
50
57
|
def from_nlp(cls, nlp, **cfg):
|
|
58
|
+
"""
|
|
59
|
+
Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
nlp (Language): The spaCy language pipeline.
|
|
63
|
+
**cfg: Additional configuration parameters for initialization.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
PyRuSHSentencizer: An initialized sentencizer instance.
|
|
67
|
+
"""
|
|
51
68
|
return cls(**cfg)
|
|
52
69
|
|
|
53
70
|
def __call__(self, doc):
|
|
71
|
+
"""
|
|
72
|
+
Apply sentence boundary detection to a spaCy Doc and set sentence start annotations.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
doc (Doc): The spaCy Doc to process.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
Doc: The processed Doc with sentence boundaries set.
|
|
79
|
+
"""
|
|
54
80
|
tags = self.predict([doc])
|
|
55
81
|
cset_annotations([doc], tags)
|
|
56
82
|
return doc
|
|
57
83
|
|
|
58
84
|
def predict(self, docs):
|
|
59
|
-
"""
|
|
60
|
-
|
|
85
|
+
"""
|
|
86
|
+
Predict sentence boundaries for a batch of spaCy Docs.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
docs (list of Doc): List of spaCy Docs to process.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
list of list of bool: Sentence start guesses for each Doc.
|
|
93
|
+
|
|
94
|
+
Notes:
|
|
95
|
+
- Does not modify the Docs; only returns sentence start predictions.
|
|
61
96
|
"""
|
|
62
97
|
if self.merge_gaps:
|
|
63
|
-
|
|
64
|
-
guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans)
|
|
98
|
+
guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
|
|
65
99
|
else:
|
|
66
|
-
guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans)
|
|
100
|
+
guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
|
|
67
101
|
return guesses
|
|
68
102
|
|
|
69
103
|
def set_annotations(self, docs, batch_tag_ids, tensors=None):
|
|
70
104
|
"""
|
|
71
|
-
|
|
105
|
+
Set sentence boundary annotations on spaCy Docs.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
docs (list of Doc): List of spaCy Docs to annotate.
|
|
109
|
+
batch_tag_ids (list of list of bool): Sentence start tags for each Doc.
|
|
110
|
+
tensors: Placeholder for future extensions (optional).
|
|
72
111
|
|
|
73
|
-
|
|
74
|
-
|
|
112
|
+
Notes:
|
|
113
|
+
- This method overwrites spaCy's Sentencizer annotations.
|
|
75
114
|
"""
|
|
76
115
|
cset_annotations(docs, batch_tag_ids, tensors)
|
|
@@ -27,8 +27,7 @@
|
|
|
27
27
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
28
28
|
# See the License for the specific language governing permissions and
|
|
29
29
|
# limitations under the License.
|
|
30
|
-
import
|
|
31
|
-
import logging.config
|
|
30
|
+
from loguru import logger
|
|
32
31
|
import os.path
|
|
33
32
|
from typing import Union, List
|
|
34
33
|
|
|
@@ -40,40 +39,7 @@ END = 'stend'
|
|
|
40
39
|
|
|
41
40
|
|
|
42
41
|
def initLogger():
|
|
43
|
-
|
|
44
|
-
'logging.ini']
|
|
45
|
-
config_file = None
|
|
46
|
-
for f in config_files:
|
|
47
|
-
if os.path.isfile(f):
|
|
48
|
-
config_file = f
|
|
49
|
-
break
|
|
50
|
-
if config_file is None:
|
|
51
|
-
config_file = config_files[-1]
|
|
52
|
-
with open(config_file, 'w') as f:
|
|
53
|
-
f.write('''[loggers]
|
|
54
|
-
keys=root
|
|
55
|
-
|
|
56
|
-
[handlers]
|
|
57
|
-
keys=consoleHandler
|
|
58
|
-
|
|
59
|
-
[formatters]
|
|
60
|
-
keys=simpleFormatter
|
|
61
|
-
|
|
62
|
-
[logger_root]
|
|
63
|
-
level=WARNING
|
|
64
|
-
handlers=consoleHandler
|
|
65
|
-
|
|
66
|
-
[handler_consoleHandler]
|
|
67
|
-
class=StreamHandler
|
|
68
|
-
level=WARNING
|
|
69
|
-
formatter=simpleFormatter
|
|
70
|
-
args=(sys.stdout,)
|
|
71
|
-
|
|
72
|
-
[formatter_simpleFormatter]
|
|
73
|
-
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
|
74
|
-
datefmt=
|
|
75
|
-
''')
|
|
76
|
-
logging.config.fileConfig(config_file)
|
|
42
|
+
pass # Removed: logging config logic for Loguru migration
|
|
77
43
|
|
|
78
44
|
|
|
79
45
|
class RuSH:
|
|
@@ -84,9 +50,7 @@ class RuSH:
|
|
|
84
50
|
self.fastner = FastCNER(rules, max_repeat)
|
|
85
51
|
self.fastner.span_compare_method = 'scorewidth'
|
|
86
52
|
if enable_logger:
|
|
87
|
-
|
|
88
|
-
self.logger = logging.getLogger(__name__)
|
|
89
|
-
print(self.logger.level)
|
|
53
|
+
self.logger = logger
|
|
90
54
|
else:
|
|
91
55
|
self.logger = None
|
|
92
56
|
self.auto_fix_gaps = auto_fix_gaps
|
|
@@ -109,13 +73,13 @@ class RuSH:
|
|
|
109
73
|
self.fastner.process(text, 0, result)
|
|
110
74
|
|
|
111
75
|
# log important message for debugging use
|
|
112
|
-
if self.logger is not None
|
|
76
|
+
if self.logger is not None:
|
|
113
77
|
text = text.replace('\n', ' ')
|
|
114
78
|
for concept_type, spans in result.items():
|
|
115
|
-
self.logger.debug(concept_type)
|
|
79
|
+
self.logger.opt(lazy=True).debug(concept_type)
|
|
116
80
|
for span in spans:
|
|
117
81
|
rule = self.fastner.rule_store[span.rule_id]
|
|
118
|
-
self.logger.debug(
|
|
82
|
+
self.logger.opt(lazy=True).debug(
|
|
119
83
|
'\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(span.begin, span.end,
|
|
120
84
|
span.score,
|
|
121
85
|
text[:span.begin],
|
|
@@ -185,15 +149,15 @@ class RuSH:
|
|
|
185
149
|
if trimed_gap is not None and trimed_gap.width > self.min_sent_chars:
|
|
186
150
|
output.append(trimed_gap)
|
|
187
151
|
|
|
188
|
-
if self.logger is not None
|
|
152
|
+
if self.logger is not None:
|
|
189
153
|
for sentence in output:
|
|
190
|
-
self.logger.debug(
|
|
154
|
+
self.logger.opt(lazy=True).debug(
|
|
191
155
|
'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end, text[sentence.begin:sentence.end]))
|
|
192
156
|
|
|
193
157
|
return output
|
|
194
158
|
|
|
195
159
|
@staticmethod
|
|
196
|
-
def fix_gap(sentences:
|
|
160
|
+
def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
|
|
197
161
|
trimed_gap = RuSH.trim_gap(text, previous_end, this_begin)
|
|
198
162
|
if trimed_gap is None:
|
|
199
163
|
return
|
|
@@ -203,7 +167,7 @@ class RuSH:
|
|
|
203
167
|
sentences[-1].end = trimed_gap.end
|
|
204
168
|
|
|
205
169
|
@staticmethod
|
|
206
|
-
def trim_gap(text: str, previous_end: int, this_begin: int) -> Span:
|
|
170
|
+
def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None':
|
|
207
171
|
begin = -1
|
|
208
172
|
alnum_begin = -1
|
|
209
173
|
end = 0
|