PyRuSH 1.0.13__cp313-cp313-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,115 @@
1
+ # ******************************************************************************
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2020 Jianlin Shi
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
7
+ # files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
8
+ # modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
9
+ # Software is furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
14
+ # WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
15
+ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
16
+ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17
+ # ******************************************************************************
18
+ from spacy import Language
19
+ from spacy.pipeline import Sentencizer
20
+ from typing import Optional
21
+ from .RuSH import RuSH
22
+ from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_annotations
23
+
24
+
25
@Language.factory("medspacy_pyrush")
class PyRuSHSentencizer(Sentencizer):
    """spaCy pipeline component that sets sentence boundaries using RuSH rules.

    The component wraps a ``RuSH`` instance and delegates the token-level
    boundary prediction and annotation to the compiled helpers imported from
    ``StaticSentencizerFun``.
    """

    def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50,
                 auto_fix_gaps: bool = True, merge_gaps: bool = False,
                 max_sentence_length: Optional[int] = None) -> None:
        """
        Initialize the PyRuSH sentencizer component.

        Args:
            nlp (Language): The spaCy language pipeline.
            name (str): Name of the component. Default is "medspacy_pyrush".
            rules_path (str): Path to the rule file or rules themselves. If empty or None, defaults to
                'conf/rush_rules.tsv' located one directory above this module's directory.
            max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules.
            auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules.
            merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False,
                splits gaps (might be multiple whitespaces or new line characters) into separate sentences.
            max_sentence_length (int or None): Maximum allowed sentence length in characters. If set,
                sentences longer than this will be split.

        Notes:
            - Setting merge_gaps controls whether gaps are merged or split.
            - max_sentence_length applies to both merge and split modes.
        """
        # NOTE(review): the parent Sentencizer.__init__ is intentionally not invoked here,
        # matching the original behavior; attributes defined only by the parent are not set.
        self.nlp = nlp
        self.name = name
        if rules_path is None or rules_path == '':
            # Fall back to the rule file shipped with the package.
            import os
            root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
            rules_path = str(os.path.join(root, 'conf', 'rush_rules.tsv'))
        self.rules_path = rules_path
        self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
        self.merge_gaps = merge_gaps
        self.max_sentence_length = max_sentence_length

    @classmethod
    def from_nlp(cls, nlp, **cfg):
        """
        Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration.

        Args:
            nlp (Language): The spaCy language pipeline.
            **cfg: Additional configuration parameters for initialization.

        Returns:
            PyRuSHSentencizer: An initialized sentencizer instance.
        """
        # ``nlp`` is a required positional argument of ``__init__``; it must be forwarded,
        # otherwise construction fails with a TypeError.
        return cls(nlp, **cfg)

    def __call__(self, doc):
        """
        Apply sentence boundary detection to a spaCy Doc and set sentence start annotations.

        Args:
            doc (Doc): The spaCy Doc to process.

        Returns:
            Doc: The processed Doc with sentence boundaries set.
        """
        tags = self.predict([doc])
        cset_annotations([doc], tags)
        return doc

    def predict(self, docs):
        """
        Predict sentence boundaries for a batch of spaCy Docs.

        Args:
            docs (list of Doc): List of spaCy Docs to process.

        Returns:
            list of list of bool: Sentence start guesses for each Doc.

        Notes:
            - Does not modify the Docs; only returns sentence start predictions.
        """
        # Both helpers share the same call signature; merge_gaps only selects the strategy.
        if self.merge_gaps:
            guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
        else:
            guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
        return guesses

    def set_annotations(self, docs, batch_tag_ids, tensors=None):
        """
        Set sentence boundary annotations on spaCy Docs.

        Args:
            docs (list of Doc): List of spaCy Docs to annotate.
            batch_tag_ids (list of list of bool): Sentence start tags for each Doc.
            tensors: Placeholder for future extensions (optional).

        Notes:
            - This method overwrites spaCy's Sentencizer annotations.
        """
        cset_annotations(docs, batch_tag_ids, tensors)
PyRuSH/RuSH.py ADDED
@@ -0,0 +1,190 @@
1
+ # ******************************************************************************
2
+ # MIT License
3
+ #
4
+ # Copyright (c) 2020 Jianlin Shi
5
+ #
6
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation
7
+ # files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy,
8
+ # modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the
9
+ # Software is furnished to do so, subject to the following conditions:
10
+ #
11
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
12
+ #
13
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
14
+ # WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
15
+ # COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
16
+ # ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17
+ # ******************************************************************************
18
+ #
19
+ # Licensed under the Apache License, Version 2.0 (the "License");
20
+ # you may not use this file except in compliance with the License.
21
+ # You may obtain a copy of the License at
22
+ #
23
+ # http://www.apache.org/licenses/LICENSE-2.0
24
+ #
25
+ # Unless required by applicable law or agreed to in writing, software
26
+ # distributed under the License is distributed on an "AS IS" BASIS,
27
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
28
+ # See the License for the specific language governing permissions and
29
+ # limitations under the License.
30
+ from loguru import logger
31
+ import os.path
32
+ from typing import Union, List
33
+
34
+ from PyFastNER import FastCNER
35
+ from PyFastNER import Span
36
+
37
+ BEGIN = 'stbegin'
38
+ END = 'stend'
39
+
40
+
41
def initLogger():
    """Do nothing; kept as a stub so existing callers keep working.

    The logging configuration logic was removed for the Loguru migration.
    """
    return None
43
+
44
+
45
class RuSH:
    """Rule-based sentence segmenter driven by a ``FastCNER`` rule engine.

    The rule engine marks sentence-begin (``BEGIN``) and sentence-end (``END``)
    positions in a text; ``segToSentenceSpans`` pairs them up into ``Span``
    objects and optionally repairs the uncovered gaps between them.
    """

    def __init__(self, rules: Union[str, List] = '', max_repeat: int = 50, auto_fix_gaps: bool = True,
                 min_sent_chars: int = 5,
                 enable_logger: bool = False):
        """Build the rule engine and store the segmentation options.

        Args:
            rules: Rule definition passed straight to ``FastCNER``.
            max_repeat: Passed straight to ``FastCNER`` as its second argument.
            auto_fix_gaps: If True, text not covered by any detected sentence is
                turned into an extra sentence or merged into a neighbor.
            min_sent_chars: A trimmed gap must be wider than this many characters
                to become a sentence of its own.
            enable_logger: If True, debug messages are emitted through loguru.
        """
        self.fastner = FastCNER(rules, max_repeat)
        self.fastner.span_compare_method = 'scorewidth'
        self.logger = logger if enable_logger else None
        self.auto_fix_gaps = auto_fix_gaps
        # for old RuSh rule format (doesn't have PSEUDO and ACTUAL column), make the conversion.
        if not self.fastner.full_definition:
            self.backCompatableParseRule()
        self.min_sent_chars = min_sent_chars

    def backCompatableParseRule(self):
        """Mark every odd-scored rule as 'PSEUDO' and rebuild the engine's rule map."""
        for _rule_id, rule in self.fastner.rule_store.items():
            if rule.score % 2 != 0:
                rule.type = 'PSEUDO'
        self.fastner.constructRuleMap(self.fastner.rule_store)

    def segToSentenceSpans(self, text):
        """Segment ``text`` and return the list of sentence ``Span`` objects.

        Args:
            text: The text to segment.

        Returns:
            list: Sentence spans in the order they were produced.
        """
        output = []
        result = {BEGIN: [], END: []}
        self.fastner.process(text, 0, result)

        # log important message for debugging use
        if self.logger is not None:
            # Same-length replacement, so all offsets below remain valid.
            text = text.replace('\n', ' ')
            for concept_type, spans in result.items():
                self.logger.opt(lazy=True).debug(concept_type)
                for span in spans:
                    rule = self.fastner.rule_store[span.rule_id]
                    self.logger.opt(lazy=True).debug(
                        '\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(
                            span.begin, span.end, span.score,
                            text[:span.begin],
                            text[span.begin:span.begin + 1],
                            rule.id, rule.rule, rule.rule_name, rule.score, rule.type))
        begins = result[BEGIN]
        ends = result[END]
        last_begin_idx = len(begins) - 1
        last_end_idx = len(ends) - 1

        st_begin = 0
        st_started = False
        st_end = 0
        j = 0

        for i, token in enumerate(begins):
            if not st_started:
                st_begin = token.begin
                # A begin marker inside the sentence just closed is ignored.
                if st_begin < st_end:
                    continue
                st_started = True
            elif token.begin < st_end:
                continue

            if self.auto_fix_gaps and len(output) > 0 and st_begin > output[-1].end:
                self.fix_gap(output, text, output[-1].end, st_begin, self.min_sent_chars)
            elif self.auto_fix_gaps and len(output) == 0 and st_begin > 0:
                self.fix_gap(output, text, 0, st_begin, self.min_sent_chars)

            for k in range(j, len(ends)):
                # Stop once the next begin marker precedes this end marker.
                if i < last_begin_idx and k < last_end_idx and begins[i + 1].begin < ends[k].begin + 1:
                    break
                st_end = ends[k].begin + 1
                j = k
                # Strip trailing whitespace (including non-breaking space) from the sentence end.
                while st_end >= 1 and (text[st_end - 1].isspace() or ord(text[st_end - 1]) == 160):
                    st_end -= 1
                if st_end < st_begin:
                    continue
                elif st_started:
                    output.append(Span(st_begin, st_end))
                    st_started = False
                    if i == last_begin_idx or (k < last_end_idx and begins[i + 1].begin > ends[k + 1].end):
                        continue
                    break
                else:
                    # A later end marker extends the sentence that was just appended.
                    output[-1] = Span(st_begin, st_end)
                    st_started = False
        # fix beginning and ending gaps, in case the existing rules will miss some cases
        if self.auto_fix_gaps:
            if len(output) > 0:
                begin_trimed_gap = RuSH.trim_gap(text, 0, output[0].begin)
                if begin_trimed_gap is not None:
                    if output[0].begin <= ends[0].begin:
                        output[0].begin = begin_trimed_gap.begin
                    else:
                        output.insert(0, begin_trimed_gap)
                end_trimed_gap = RuSH.trim_gap(text, output[-1].end, len(text))
                if end_trimed_gap is not None:
                    if end_trimed_gap.width > self.min_sent_chars:
                        output.append(end_trimed_gap)
                    else:
                        output[-1].end = end_trimed_gap.end
            else:
                trimed_gap = RuSH.trim_gap(text, 0, len(text))
                if trimed_gap is not None and trimed_gap.width > self.min_sent_chars:
                    output.append(trimed_gap)

        if self.logger is not None:
            for sentence in output:
                self.logger.opt(lazy=True).debug(
                    'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end, text[sentence.begin:sentence.end]))

        return output

    @staticmethod
    def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
        """Handle the uncovered text between ``previous_end`` and ``this_begin``.

        The gap is trimmed with ``trim_gap``. A trimmed gap wider than
        ``min_sent_chars`` is appended to ``sentences`` as a new sentence;
        otherwise the last existing sentence (if any) is extended to the
        trimmed gap's end. ``sentences`` is modified in place.
        """
        gap = RuSH.trim_gap(text, previous_end, this_begin)
        if gap is None:
            return
        if gap.width > min_sent_chars:
            sentences.append(gap)
        elif sentences:
            sentences[-1].end = gap.end

    @staticmethod
    def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None':
        """Trim ``text[previous_end:this_begin]`` down to its meaningful content.

        The left edge is the first non-whitespace character. The right edge is
        the last character after the left edge that is alphanumeric or one of
        ``. ! ? ) ] "``.

        Returns:
            A ``Span`` in absolute text offsets (end exclusive), or None when no
            such right edge exists after the left edge.
        """
        closing_chars = ('.', '!', '?', ')', ']', '\"')
        gap_width = this_begin - previous_end
        gap = text[previous_end:this_begin]

        # trim left
        first = -1
        for idx in range(gap_width):
            if not gap[idx].isspace():
                first = idx
                break
        if first == -1:
            # Nothing but whitespace (or an empty gap).
            return None

        # trim right, never scanning back onto the left edge itself
        last = 0
        for idx in range(gap_width - 1, first, -1):
            char = gap[idx]
            if char.isalnum() or char in closing_chars:
                last = idx
                break

        if last > first:
            return Span(previous_end + first, previous_end + last + 1)
        return None