PyRuSH 1.0.11__tar.gz → 1.0.12.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PKG-INFO +14 -1
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/__init__.py +1 -1
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/PKG-INFO +14 -1
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/SOURCES.txt +1 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/README.rst +13 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/conf/rush_rules.tsv +2 -8
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/setup.cfg +0 -1
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_PyRuSHSentencizer_param.py +1 -1
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_PyRushSentencizer.py +6 -5
- pyrush-1.0.12.dev1/tests/test_PyRushSentencizer2.py +45 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_Rush.py +38 -6
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/LICENSE +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/MANIFEST.in +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/PyRuSHSentencizer.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/RuSH.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/StaticSentencizerFun.cpp +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/StaticSentencizerFun.pyx +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/dependency_links.txt +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/not-zip-safe +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/requires.txt +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/top_level.txt +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/pyproject.toml +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/requirements.txt +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/setup.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_Rush_w_Logger.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_cpredict_split_gaps.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_debug.py +0 -0
- {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_merge_gaps_max_length.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyRuSH
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.12.dev1
|
|
4
4
|
Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easy adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical note. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
|
|
5
5
|
Home-page: https://github.com/jianlins/PyRuSH
|
|
6
6
|
Author: Jianlin
|
|
@@ -77,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
|
|
|
77
77
|
A Colab Notebook Demo
|
|
78
78
|
---------------------------
|
|
79
79
|
Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
|
|
80
|
+
|
|
81
|
+
Revision History
|
|
82
|
+
----------------
|
|
83
|
+
|
|
84
|
+
**1.0.11 (2025-09-02)**
|
|
85
|
+
|
|
86
|
+
- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
|
|
87
|
+
- Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
|
|
88
|
+
|
|
89
|
+
**1.0.9 (2024-10-27)**
|
|
90
|
+
|
|
91
|
+
- Initial release with spaCy 3.x compatibility and core RuSH logic.
|
|
92
|
+
- Added Spacy-compatible PyRuSHSentencizer component.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyRuSH
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.12.dev1
|
|
4
4
|
Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easy adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical note. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
|
|
5
5
|
Home-page: https://github.com/jianlins/PyRuSH
|
|
6
6
|
Author: Jianlin
|
|
@@ -77,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
|
|
|
77
77
|
A Colab Notebook Demo
|
|
78
78
|
---------------------------
|
|
79
79
|
Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
|
|
80
|
+
|
|
81
|
+
Revision History
|
|
82
|
+
----------------
|
|
83
|
+
|
|
84
|
+
**1.0.11 (2025-09-02)**
|
|
85
|
+
|
|
86
|
+
- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
|
|
87
|
+
- Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
|
|
88
|
+
|
|
89
|
+
**1.0.9 (2024-10-27)**
|
|
90
|
+
|
|
91
|
+
- Initial release with spaCy 3.x compatibility and core RuSH logic.
|
|
92
|
+
- Added Spacy-compatible PyRuSHSentencizer component.
|
|
@@ -52,3 +52,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
|
|
|
52
52
|
A Colab Notebook Demo
|
|
53
53
|
---------------------------
|
|
54
54
|
Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
|
|
55
|
+
|
|
56
|
+
Revision History
|
|
57
|
+
----------------
|
|
58
|
+
|
|
59
|
+
**1.0.11 (2025-09-02)**
|
|
60
|
+
|
|
61
|
+
- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
|
|
62
|
+
- Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
|
|
63
|
+
|
|
64
|
+
**1.0.9 (2024-10-27)**
|
|
65
|
+
|
|
66
|
+
- Initial release with spaCy 3.x compatibility and core RuSH logic.
|
|
67
|
+
- Added Spacy-compatible PyRuSHSentencizer component.
|
|
@@ -20,7 +20,7 @@
|
|
|
20
20
|
|
|
21
21
|
#stbegin is the marker for sentence begin, the span of sentence will start at the begin of the captured group
|
|
22
22
|
#stbegin has two scores 0, 1: 0 for true sentence begin clues, 1 for false sentence begin clues which will overwrite 0-scored rules when they are overlapping.
|
|
23
|
-
#stend is the marker for sentence
|
|
23
|
+
#stend is the marker for sentence end, the span of sentence will end at the end of the captured group
|
|
24
24
|
#stend also has two scores 2, 3: 2 for true sentence end clues, 3 for false sentence end clues which will overwrite 2-scored rules when they are overlapping
|
|
25
25
|
|
|
26
26
|
# \b the begin of an input
|
|
@@ -47,12 +47,6 @@
|
|
|
47
47
|
\b\s+(\C 0 stbegin
|
|
48
48
|
\b\s+(\d 0 stbegin
|
|
49
49
|
\c.\s+(\C) 0 stbegin
|
|
50
|
-
Dr.\s+(\C) 1 stbegin
|
|
51
|
-
Mr.\s+(\C) 1 stbegin
|
|
52
|
-
Ms.\s+(\C) 1 stbegin
|
|
53
|
-
Miss.\s+(\C) 1 stbegin
|
|
54
|
-
Mrs.\s+(\C) 1 stbegin
|
|
55
|
-
dr.\s+(\C) 1 stbegin
|
|
56
50
|
mL.\s+(\C) 0 stbegin
|
|
57
51
|
*) 1 stbegin
|
|
58
52
|
\c\c.\s+(\C) 0 stbegin
|
|
@@ -245,7 +239,7 @@ dr.\s+(\C) 1 stbegin
|
|
|
245
239
|
\n(? \C 0 stbegin
|
|
246
240
|
\n(? \c 0 stbegin
|
|
247
241
|
\n(. \C 0 stbegin
|
|
248
|
-
\n(
|
|
242
|
+
\n(+ \C 0 stbegin
|
|
249
243
|
\n(/ \C 0 stbegin
|
|
250
244
|
\n+\d\d-\d\d\s+(\C 0 stbegin
|
|
251
245
|
\n+\d+-\d\d-\d\d\s+(\C 0 stbegin
|
|
@@ -6,7 +6,7 @@ from PyRuSH.PyRuSHSentencizer import PyRuSHSentencizer
|
|
|
6
6
|
text_short = "Sentence one. Sentence two!"
|
|
7
7
|
text_long = "This is a very long sentence that should be split at whitespace before the max length is reached. " * 5
|
|
8
8
|
text_whitespace = "First sentence. Second sentence after spaces.\nThird sentence after newline."
|
|
9
|
-
rule_path = os.path.join(os.path.dirname(
|
|
9
|
+
rule_path = os.path.join(os.path.dirname(__file__), "rush_rules.tsv")
|
|
10
10
|
|
|
11
11
|
def make_nlp(merge_gaps, max_sentence_length):
|
|
12
12
|
nlp = English()
|
|
@@ -10,7 +10,7 @@ from spacy.lang.en import English
|
|
|
10
10
|
class TestRuSH(unittest.TestCase):
|
|
11
11
|
|
|
12
12
|
def setUp(self):
|
|
13
|
-
pwd = os.path.dirname(os.path.abspath(__file__))
|
|
13
|
+
self.pwd = os.path.dirname(os.path.abspath(__file__))
|
|
14
14
|
|
|
15
15
|
def test_doc(self):
|
|
16
16
|
nlp = English()
|
|
@@ -51,14 +51,14 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
|
|
|
51
51
|
End Ezoic - MTSam Sample Bottom Matched Content - native_bottom
|
|
52
52
|
'''
|
|
53
53
|
nlp = English()
|
|
54
|
-
nlp.add_pipe("medspacy_pyrush")
|
|
54
|
+
nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
|
|
55
55
|
doc = nlp(input_str)
|
|
56
56
|
sents = [s for s in doc.sents]
|
|
57
57
|
for sent in sents:
|
|
58
58
|
print('>' + str(sent) + '<\n\n')
|
|
59
59
|
|
|
60
60
|
# New expected count includes whitespace-only sentences
|
|
61
|
-
assert (len(sents) ==
|
|
61
|
+
assert (len(sents) == 51)
|
|
62
62
|
# For content checks, filter out whitespace-only sentences
|
|
63
63
|
content_sents = [s for s in sents if s.text.strip()]
|
|
64
64
|
assert (content_sents[0].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.')
|
|
@@ -74,7 +74,7 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
|
|
|
74
74
|
from loguru import logger
|
|
75
75
|
logger.add(sys.stdout, level="DEBUG")
|
|
76
76
|
nlp = English()
|
|
77
|
-
nlp.add_pipe("medspacy_pyrush")
|
|
77
|
+
nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
|
|
78
78
|
doc = nlp(input_str)
|
|
79
79
|
sents = [s for s in doc.sents]
|
|
80
80
|
for sent in sents:
|
|
@@ -116,4 +116,5 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
|
|
|
116
116
|
|
|
117
117
|
# SpaCy has no control of sentence end. Thus, it ends up with sloppy ends.
|
|
118
118
|
assert (sents[1].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with'
|
|
119
|
-
' chest pain and respiratory insufficiency.')
|
|
119
|
+
' chest pain and respiratory insufficiency.')
|
|
120
|
+
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import unittest
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
|
|
5
|
+
|
|
6
|
+
from PyRuSH import PyRuSHSentencizer
|
|
7
|
+
from spacy.lang.en import English
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class TestRuSH(unittest.TestCase):
|
|
11
|
+
|
|
12
|
+
def setUp(self):
|
|
13
|
+
self.pwd = os.path.dirname(os.path.abspath(__file__))
|
|
14
|
+
|
|
15
|
+
# def test_doc(self):
|
|
16
|
+
# nlp = English()
|
|
17
|
+
# nlp.add_pipe("medspacy_pyrush")
|
|
18
|
+
# doc = nlp("This is a sentence. This is another sentence.")
|
|
19
|
+
# print('\n'.join([str(s) for s in doc.sents]))
|
|
20
|
+
# print('\nTotal sentences: {}'.format(len([s for s in doc.sents])))
|
|
21
|
+
# print('\ndoc is an instance of {}'.format(type(doc)))
|
|
22
|
+
|
|
23
|
+
def test_doc4(self):
|
|
24
|
+
input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound
|
|
25
|
+
at the time of admission demonstrated pancreatic duct dilitation and
|
|
26
|
+
edematous gallbladder. She was admitted to the ICU.
|
|
27
|
+
Discharge Medications:
|
|
28
|
+
1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID
|
|
29
|
+
(2 times a day) as needed.
|
|
30
|
+
2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1)
|
|
31
|
+
Injection TID (3 times a day).
|
|
32
|
+
3. Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H
|
|
33
|
+
(every 4 to 6 hours) as needed.'''
|
|
34
|
+
nlp = English()
|
|
35
|
+
nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
|
|
36
|
+
nlp.initialize()
|
|
37
|
+
doc = nlp(input_str)
|
|
38
|
+
sents = [s for s in doc.sents]
|
|
39
|
+
for sent in sents:
|
|
40
|
+
print('>' + str(sent) + '<\n\n')
|
|
41
|
+
assert(sents[-1].text=='''Sig: One (1) PO Q4-6H
|
|
42
|
+
(every 4 to 6 hours) as needed.''')
|
|
43
|
+
|
|
44
|
+
if __name__ == '__main__':
|
|
45
|
+
unittest.main()
|
|
@@ -95,7 +95,7 @@ class TestRuSH(unittest.TestCase):
|
|
|
95
95
|
sentences = rush.segToSentenceSpans(input_str)
|
|
96
96
|
self.printDetails(sentences, input_str)
|
|
97
97
|
|
|
98
|
-
def
|
|
98
|
+
def test8(self):
|
|
99
99
|
input_str = '''
|
|
100
100
|
9. Advair b.i.d.
|
|
101
101
|
10. Xopenex q.i.d. and p.r.n.
|
|
@@ -109,12 +109,44 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
|
|
|
109
109
|
sent = sentences[1]
|
|
110
110
|
assert (input_str[sent.begin:sent.end] == '10. Xopenex q.i.d. and p.r.n.')
|
|
111
111
|
|
|
112
|
-
|
|
113
|
-
def test_doc11(self):
|
|
112
|
+
def test9(self):
|
|
114
113
|
input_str=' This is a sentence. This is another sentence.'
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
114
|
+
self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
|
|
115
|
+
sentences = self.rush.segToSentenceSpans(input_str)
|
|
116
|
+
self.printDetails(sentences, input_str)
|
|
117
|
+
|
|
118
|
+
def test10(self):
|
|
119
|
+
input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound
|
|
120
|
+
at the time of admission demonstrated pancreatic duct dilitation and
|
|
121
|
+
edematous gallbladder. She was admitted to the ICU.
|
|
122
|
+
Discharge Medications:
|
|
123
|
+
1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID
|
|
124
|
+
(2 times a day) as needed.
|
|
125
|
+
2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1)
|
|
126
|
+
Injection TID (3 times a day).
|
|
127
|
+
3. Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H
|
|
128
|
+
(every 4 to 6 hours) as needed.'''
|
|
129
|
+
self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
|
|
130
|
+
sentences = self.rush.segToSentenceSpans(input_str)
|
|
131
|
+
self.printDetails(sentences, input_str)
|
|
132
|
+
assert (sentences[0].begin == 0 and sentences[0].end == 173)
|
|
133
|
+
assert (sentences[1].begin == 174 and sentences[1].end == 202)
|
|
134
|
+
assert (sentences[2].begin == 203 and sentences[2].end == 225)
|
|
135
|
+
assert (sentences[3].begin == 226 and sentences[3].end == 258)
|
|
136
|
+
assert (sentences[4].begin == 259 and sentences[4].end == 316)
|
|
137
|
+
assert (sentences[5].begin == 317 and sentences[5].end == 367)
|
|
138
|
+
assert (sentences[6].begin == 368 and sentences[6].end == 411)
|
|
139
|
+
assert (sentences[7].begin == 412 and sentences[7].end == 447)
|
|
140
|
+
assert (sentences[8].begin == 448 and sentences[8].end == 502)
|
|
141
|
+
|
|
142
|
+
def test11(self):
|
|
143
|
+
input_str = '''Patient doesn't have heart disease or high blood pressure, but their dad did have
|
|
144
|
+
diabetes. Pt is a 63M w/ h/o metastatic carcinoid tumor, HTN and hyperlipidemia.'''
|
|
145
|
+
self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
|
|
146
|
+
sentences = self.rush.segToSentenceSpans(input_str)
|
|
147
|
+
self.printDetails(sentences, input_str)
|
|
148
|
+
assert (sentences[0].begin == 0 and sentences[0].end == 91)
|
|
149
|
+
assert (sentences[1].begin == 92 and sentences[1].end == 162)
|
|
118
150
|
|
|
119
151
|
if __name__ == '__main__':
|
|
120
152
|
unittest.main()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|