PyRuSH 1.0.13__cp313-cp313-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- PyRuSH/PyRuSHSentencizer.py +115 -0
- PyRuSH/RuSH.py +190 -0
- PyRuSH/StaticSentencizerFun.cp313-win_amd64.pyd +0 -0
- PyRuSH/StaticSentencizerFun.cpp +14217 -0
- PyRuSH/StaticSentencizerFun.pyx +215 -0
- PyRuSH/__init__.py +36 -0
- conf/rush_rules.tsv +977 -0
- pyrush-1.0.13.dist-info/METADATA +92 -0
- pyrush-1.0.13.dist-info/RECORD +13 -0
- pyrush-1.0.13.dist-info/WHEEL +5 -0
- pyrush-1.0.13.dist-info/licenses/LICENSE +21 -0
- pyrush-1.0.13.dist-info/top_level.txt +1 -0
- requirements.txt +7 -0
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
# ******************************************************************************
|
|
2
|
+
# MIT License
|
|
3
|
+
#
|
|
4
|
+
# Copyright (c) 2020 Jianlin Shi
|
|
5
|
+
#
|
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
|
|
7
|
+
# files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
|
|
8
|
+
# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
|
|
9
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
#
|
|
11
|
+
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
#
|
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
14
|
+
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
15
|
+
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
16
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
17
|
+
# ******************************************************************************
|
|
18
|
+
from spacy import Language
|
|
19
|
+
from spacy.pipeline import Sentencizer
|
|
20
|
+
from typing import Optional
|
|
21
|
+
from .RuSH import RuSH
|
|
22
|
+
from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_annotations
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@Language.factory("medspacy_pyrush")
class PyRuSHSentencizer(Sentencizer):
    """spaCy sentencizer component backed by the RuSH rule-based segmenter.

    The component is registered with spaCy under the factory name
    ``"medspacy_pyrush"``. Sentence spans are produced by
    ``RuSH.segToSentenceSpans`` and converted to per-token sentence-start
    tags by the compiled helpers imported from ``StaticSentencizerFun``.
    """

    def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50,
                 auto_fix_gaps: bool = True, merge_gaps: bool = False,
                 max_sentence_length: Optional[int] = None) -> None:
        """
        Initialize the PyRuSH sentencizer component.

        Args:
            nlp (Language): The spaCy language pipeline.
            name (str): Name of the component. Default is "medspacy_pyrush".
            rules_path (str): Path to the rule file or rules themselves. If None or empty, defaults to
                'conf/rush_rules.tsv' located one directory above this package.
            max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules.
            auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules.
            merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False,
                splits gaps (might be multiple whitespaces or new line characters) into separate sentences.
            max_sentence_length (int or None): Maximum allowed sentence length in characters. If set, sentences
                longer than this will be split.

        Notes:
            - Setting merge_gaps controls whether gaps are merged or split.
            - max_sentence_length applies to both merge and split modes.
        """
        self.nlp = nlp
        self.name = name
        if rules_path is None or rules_path == '':
            import os
            # The bundled rule file lives in <install root>/conf, i.e. next to the package directory.
            root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            rules_path = str(os.path.join(root, 'conf', 'rush_rules.tsv'))
        self.rules_path = rules_path
        self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
        self.merge_gaps = merge_gaps
        self.max_sentence_length = max_sentence_length

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        """
        Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration.

        Args:
            nlp (Language): The spaCy language pipeline.
            **cfg: Additional configuration parameters for initialization.

        Returns:
            PyRuSHSentencizer: An initialized sentencizer instance.
        """
        # `nlp` is a required argument of __init__, so it must be forwarded explicitly;
        # it can never arrive through **cfg because this method captures it by name.
        return cls(nlp, **cfg)

    def __call__(self, doc):
        """
        Apply sentence boundary detection to a spaCy Doc and set sentence start annotations.

        Args:
            doc (Doc): The spaCy Doc to process.

        Returns:
            Doc: The same Doc object, after `cset_annotations` has been applied to it.
        """
        tags = self.predict([doc])
        cset_annotations([doc], tags)
        return doc

    def predict(self, docs):
        """
        Predict sentence boundaries for a batch of spaCy Docs.

        Args:
            docs (list of Doc): List of spaCy Docs to process.

        Returns:
            list of list of bool: Sentence start guesses for each Doc.

        Notes:
            - Does not modify the Docs; only returns sentence start predictions.
        """
        # Both helpers take the same arguments; only the gap-handling strategy differs.
        segment = cpredict_merge_gaps if self.merge_gaps else cpredict_split_gaps
        return segment(docs, self.rush.segToSentenceSpans, self.max_sentence_length)

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """
        Set sentence boundary annotations on spaCy Docs.

        Args:
            docs (list of Doc): List of spaCy Docs to annotate.
            batch_tag_ids (list of list of bool): Sentence start tags for each Doc.
            tensors: Placeholder for future extensions (optional).

        Notes:
            - This method overwrites spaCy's Sentencizer annotations.
        """
        cset_annotations(docs, batch_tag_ids, tensors)
|
PyRuSH/RuSH.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
# ******************************************************************************
|
|
2
|
+
# MIT License
|
|
3
|
+
#
|
|
4
|
+
# Copyright (c) 2020 Jianlin Shi
|
|
5
|
+
#
|
|
6
|
+
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
|
|
7
|
+
# files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
|
|
8
|
+
# modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
|
|
9
|
+
# Software is furnished to do so, subject to the following conditions:
|
|
10
|
+
#
|
|
11
|
+
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
|
12
|
+
#
|
|
13
|
+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
|
|
14
|
+
# WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
|
|
15
|
+
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
|
|
16
|
+
# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
17
|
+
# ******************************************************************************
|
|
18
|
+
#
|
|
19
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
20
|
+
# you may not use this file except in compliance with the License.
|
|
21
|
+
# You may obtain a copy of the License at
|
|
22
|
+
#
|
|
23
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
24
|
+
#
|
|
25
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
26
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
27
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
28
|
+
# See the License for the specific language governing permissions and
|
|
29
|
+
# limitations under the License.
|
|
30
|
+
from loguru import logger
|
|
31
|
+
import os.path
|
|
32
|
+
from typing import Union, List
|
|
33
|
+
|
|
34
|
+
from PyFastNER import FastCNER
|
|
35
|
+
from PyFastNER import Span
|
|
36
|
+
|
|
37
|
+
BEGIN = 'stbegin'
|
|
38
|
+
END = 'stend'
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def initLogger():
    """Do nothing; kept as a stub so existing callers keep working.

    The logging configuration logic that used to live here was removed
    for the Loguru migration.
    """
    return None
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RuSH:
    """Rule-based sentence segmenter driven by a PyFastNER ``FastCNER`` engine.

    The engine is asked for matches of type ``BEGIN`` ('stbegin') and ``END``
    ('stend'); ``segToSentenceSpans`` pairs those matches into sentence spans
    and, when ``auto_fix_gaps`` is enabled, patches text left uncovered.
    """

    def __init__(self, rules: Union[str, List] = '', max_repeat: int = 50, auto_fix_gaps: bool = True,
                 min_sent_chars: int = 5,
                 enable_logger: bool = False):
        """Build the rule engine and store the segmentation options.

        Args:
            rules: Rule source handed unchanged to ``FastCNER``.
            max_repeat: Second argument handed unchanged to ``FastCNER``.
            auto_fix_gaps: If True, uncovered text between/around sentences is
                turned into sentences or merged into neighbours.
            min_sent_chars: A trimmed gap must be strictly wider than this to
                become its own sentence; otherwise it is merged.
            enable_logger: If True, debug messages are sent to the loguru logger.
        """
        self.fastner = FastCNER(rules, max_repeat)
        self.fastner.span_compare_method = 'scorewidth'
        self.logger = logger if enable_logger else None
        self.auto_fix_gaps = auto_fix_gaps
        # for old RuSH rule format (doesn't have PSEUDO and ACTUAL column), make the conversion.
        if not self.fastner.full_definition:
            self.backCompatableParseRule()
        self.min_sent_chars = min_sent_chars

    def backCompatableParseRule(self):
        """Mark every rule with an odd score as 'PSEUDO' and rebuild the rule map."""
        for _rule_id, rule in self.fastner.rule_store.items():
            if rule.score % 2 != 0:
                rule.type = 'PSEUDO'
        self.fastner.constructRuleMap(self.fastner.rule_store)

    def segToSentenceSpans(self, text):
        """Segment ``text`` and return the list of sentence ``Span`` objects.

        Args:
            text (str): The text to segment.

        Returns:
            list: ``Span`` objects whose ``begin``/``end`` are character offsets
            into ``text``, in document order.
        """
        output = []
        result = {BEGIN: [], END: []}
        self.fastner.process(text, 0, result)

        # log important message for debugging use
        if self.logger is not None:
            # Length-preserving replacement (newline -> space), so all offsets stay valid below.
            text = text.replace('\n', ' ')
            for concept_type, spans in result.items():
                self.logger.opt(lazy=True).debug(concept_type)
                for span in spans:
                    rule = self.fastner.rule_store[span.rule_id]
                    self.logger.opt(lazy=True).debug(
                        '\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(
                            span.begin, span.end,
                            span.score,
                            text[:span.begin],
                            text[span.begin:span.begin + 1],
                            rule.id, rule.rule,
                            rule.rule_name,
                            rule.score, rule.type))
        begins = result[BEGIN]
        ends = result[END]

        st_begin = 0
        st_started = False
        st_end = 0
        j = 0  # index of the last END marker consumed; the inner scan resumes from here

        for i, token in enumerate(begins):
            if not st_started:
                st_begin = token.begin
                # A begin marker inside the sentence just closed is ignored.
                if st_begin < st_end:
                    continue
                st_started = True
            elif token.begin < st_end:
                continue

            if self.auto_fix_gaps and len(output) > 0 and st_begin > output[-1].end:
                self.fix_gap(output, text, output[-1].end, st_begin, self.min_sent_chars)
            elif self.auto_fix_gaps and len(output) == 0 and st_begin > 0:
                self.fix_gap(output, text, 0, st_begin, self.min_sent_chars)

            for k in range(j, len(ends)):
                # Stop once the next begin marker comes before this end marker.
                if i < len(begins) - 1 and k < len(ends) - 1 and begins[i + 1].begin < ends[k].begin + 1:
                    break
                st_end = ends[k].begin + 1
                j = k
                # Trim trailing whitespace (including NBSP, code point 160) from the sentence end.
                while st_end >= 1 and (text[st_end - 1].isspace() or ord(text[st_end - 1]) == 160):
                    st_end -= 1
                if st_end < st_begin:
                    continue
                elif st_started:
                    output.append(Span(st_begin, st_end))
                    st_started = False
                    if i == len(begins) - 1 or (k < len(ends) - 1 and begins[i + 1].begin > ends[k + 1].end):
                        continue
                    break
                else:
                    # A later end marker extends the sentence that was just emitted.
                    output[-1] = Span(st_begin, st_end)
                    st_started = False
        # fix beginning and ending gaps, in case the existing rules will miss some cases
        if self.auto_fix_gaps:
            if len(output) > 0:
                begin_trimed_gap = RuSH.trim_gap(text, 0, output[0].begin)
                if begin_trimed_gap is not None:
                    if output[0].begin <= ends[0].begin:
                        output[0].begin = begin_trimed_gap.begin
                    else:
                        output.insert(0, begin_trimed_gap)
                end_trimed_gap = RuSH.trim_gap(text, output[-1].end, len(text))
                if end_trimed_gap is not None:
                    if end_trimed_gap.width > self.min_sent_chars:
                        output.append(end_trimed_gap)
                    else:
                        output[-1].end = end_trimed_gap.end
            else:
                # No sentence was found at all: treat the whole trimmed text as one sentence.
                trimed_gap = RuSH.trim_gap(text, 0, len(text))
                if trimed_gap is not None and trimed_gap.width > self.min_sent_chars:
                    output.append(trimed_gap)

        if self.logger is not None:
            for sentence in output:
                self.logger.opt(lazy=True).debug(
                    'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end,
                                                        text[sentence.begin:sentence.end]))

        return output

    @staticmethod
    def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
        """Account for the uncovered text in ``text[previous_end:this_begin]``.

        The gap is trimmed with ``trim_gap``. If nothing usable remains, nothing
        happens. A trimmed gap strictly wider than ``min_sent_chars`` is appended
        to ``sentences`` as its own span; a narrower one extends the end of the
        last sentence, if there is one. ``sentences`` is modified in place.
        """
        trimed_gap = RuSH.trim_gap(text, previous_end, this_begin)
        if trimed_gap is None:
            return
        # NOTE(review): relies on the PyFastNER Span exposing a `width` attribute.
        if trimed_gap.width > min_sent_chars:
            sentences.append(trimed_gap)
        elif len(sentences) > 0:
            sentences[-1].end = trimed_gap.end

    @staticmethod
    def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None':
        """Trim the gap ``text[previous_end:this_begin]`` down to its content.

        The left edge is moved to the first non-whitespace character. The right
        edge is moved to the last character that is alphanumeric or one of
        ``. ! ? ) ] "``, searching only to the right of the left edge.

        Returns:
            A ``Span`` in ``text`` coordinates, or None when the gap is empty,
            all whitespace, or no right edge is found after the left edge
            (which includes a gap with a single non-space character).
        """
        closers = '.!?)]"'
        gap = text[previous_end:this_begin]
        begin = -1
        end = 0
        # trim left
        for offset, this_char in enumerate(gap):
            if not this_char.isspace():
                begin = offset
                break
        # trim right; never looks at or before `begin`
        for offset in range(len(gap) - 1, begin, -1):
            this_char = gap[offset]
            if this_char.isalnum() or this_char in closers:
                end = offset
                break
        if begin != -1 and end > begin:
            return Span(begin + previous_end, end + previous_end + 1)
        return None
|
|
Binary file
|