PyRuSH 1.0.11__tar.gz → 1.0.12.dev1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (28) hide show
  1. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PKG-INFO +14 -1
  2. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/__init__.py +1 -1
  3. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/PKG-INFO +14 -1
  4. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/SOURCES.txt +1 -0
  5. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/README.rst +13 -0
  6. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/conf/rush_rules.tsv +2 -8
  7. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/setup.cfg +0 -1
  8. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_PyRuSHSentencizer_param.py +1 -1
  9. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_PyRushSentencizer.py +6 -5
  10. pyrush-1.0.12.dev1/tests/test_PyRushSentencizer2.py +45 -0
  11. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_Rush.py +38 -6
  12. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/LICENSE +0 -0
  13. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/MANIFEST.in +0 -0
  14. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/PyRuSHSentencizer.py +0 -0
  15. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/RuSH.py +0 -0
  16. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/StaticSentencizerFun.cpp +0 -0
  17. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH/StaticSentencizerFun.pyx +0 -0
  18. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/dependency_links.txt +0 -0
  19. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/not-zip-safe +0 -0
  20. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/requires.txt +0 -0
  21. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/PyRuSH.egg-info/top_level.txt +0 -0
  22. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/pyproject.toml +0 -0
  23. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/requirements.txt +0 -0
  24. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/setup.py +0 -0
  25. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_Rush_w_Logger.py +0 -0
  26. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_cpredict_split_gaps.py +0 -0
  27. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_debug.py +0 -0
  28. {pyrush-1.0.11 → pyrush-1.0.12.dev1}/tests/test_merge_gaps_max_length.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PyRuSH
3
- Version: 1.0.11
3
+ Version: 1.0.12.dev1
4
4
  Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easy adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical note. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
5
5
  Home-page: https://github.com/jianlins/PyRuSH
6
6
  Author: Jianlin
@@ -77,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
77
77
  A Colab Notebook Demo
78
78
  ---------------------------
79
79
  Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
80
+
81
+ Revision History
82
+ ----------------
83
+
84
+ **1.0.11 (2025-09-02)**
85
+
86
+ - Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
87
+ - Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
88
+
89
+ **1.0.9 (2024-10-27)**
90
+
91
+ - Initial release with spaCy 3.x compatibility and core RuSH logic.
92
+ - Added Spacy-compatible PyRuSHSentencizer component.
@@ -30,7 +30,7 @@
30
30
 
31
31
  from .PyRuSHSentencizer import PyRuSHSentencizer
32
32
  from .RuSH import RuSH, BEGIN, END
33
- __version__ = '1.0.11'
33
+ __version__ = '1.0.12dev1'
34
34
 
35
35
 
36
36
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: PyRuSH
3
- Version: 1.0.11
3
+ Version: 1.0.12.dev1
4
4
  Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easy adaptable rule-based sentence segmentation solution. It is specifically designed to handle the telegraphic written text in clinical note. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
5
5
  Home-page: https://github.com/jianlins/PyRuSH
6
6
  Author: Jianlin
@@ -77,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
77
77
  A Colab Notebook Demo
78
78
  ---------------------------
79
79
  Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
80
+
81
+ Revision History
82
+ ----------------
83
+
84
+ **1.0.11 (2025-09-02)**
85
+
86
+ - Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
87
+ - Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
88
+
89
+ **1.0.9 (2024-10-27)**
90
+
91
+ - Initial release with spaCy 3.x compatibility and core RuSH logic.
92
+ - Added Spacy-compatible PyRuSHSentencizer component.
@@ -21,6 +21,7 @@ PyRuSH/../conf/rush_rules.tsv
21
21
  conf/rush_rules.tsv
22
22
  tests/test_PyRuSHSentencizer_param.py
23
23
  tests/test_PyRushSentencizer.py
24
+ tests/test_PyRushSentencizer2.py
24
25
  tests/test_Rush.py
25
26
  tests/test_Rush_w_Logger.py
26
27
  tests/test_cpredict_split_gaps.py
@@ -52,3 +52,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
52
52
  A Colab Notebook Demo
53
53
  ---------------------------
54
54
  Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
55
+
56
+ Revision History
57
+ ----------------
58
+
59
+ **1.0.11 (2025-09-02)**
60
+
61
+ - Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
62
+ - Edge case handling: Trailing whitespaces (caused by spacy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
63
+
64
+ **1.0.9 (2024-10-27)**
65
+
66
+ - Initial release with spaCy 3.x compatibility and core RuSH logic.
67
+ - Added Spacy-compatible PyRuSHSentencizer component.
@@ -20,7 +20,7 @@
20
20
 
21
21
  #stbegin is the marker for sentence begin, the span of sentence will start at the begin of the captured group
22
22
  #stbegin has two scores 0, 1: 0 for true sentence begin clues, 1 for false sentence begin clues which will overwrite 0-scored rules when they are overlapping.
23
- #stend is the marker for sentence end, the span of sentence will end at the end of the captured group
23
+ #stend is the marker for sentence end, the span of sentence will end at the end of the captured group
24
24
  #stend also has two scores 2, 3: 2 for true sentence end clues, 3 for false sentence end clues which will overwrite 2-scored rules when they are overlapping
25
25
 
26
26
  # \b the begin of an input
@@ -47,12 +47,6 @@
47
47
  \b\s+(\C 0 stbegin
48
48
  \b\s+(\d 0 stbegin
49
49
  \c.\s+(\C) 0 stbegin
50
- Dr.\s+(\C) 1 stbegin
51
- Mr.\s+(\C) 1 stbegin
52
- Ms.\s+(\C) 1 stbegin
53
- Miss.\s+(\C) 1 stbegin
54
- Mrs.\s+(\C) 1 stbegin
55
- dr.\s+(\C) 1 stbegin
56
50
  mL.\s+(\C) 0 stbegin
57
51
  *) 1 stbegin
58
52
  \c\c.\s+(\C) 0 stbegin
@@ -245,7 +239,7 @@ dr.\s+(\C) 1 stbegin
245
239
  \n(? \C 0 stbegin
246
240
  \n(? \c 0 stbegin
247
241
  \n(. \C 0 stbegin
248
- \n(\+ \C 0 stbegin
242
+ \n(+ \C 0 stbegin
249
243
  \n(/ \C 0 stbegin
250
244
  \n+\d\d-\d\d\s+(\C 0 stbegin
251
245
  \n+\d+-\d\d-\d\d\s+(\C 0 stbegin
@@ -1,7 +1,6 @@
1
1
  [metadata]
2
2
  readme = README.md
3
3
  license = MIT
4
- license_files = LICENSE
5
4
 
6
5
  [bdist_wheel]
7
6
  python_tag = py3
@@ -6,7 +6,7 @@ from PyRuSH.PyRuSHSentencizer import PyRuSHSentencizer
6
6
  text_short = "Sentence one. Sentence two!"
7
7
  text_long = "This is a very long sentence that should be split at whitespace before the max length is reached. " * 5
8
8
  text_whitespace = "First sentence. Second sentence after spaces.\nThird sentence after newline."
9
- rule_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), "conf", "rush_rules.tsv")
9
+ rule_path = os.path.join(os.path.dirname(__file__), "rush_rules.tsv")
10
10
 
11
11
  def make_nlp(merge_gaps, max_sentence_length):
12
12
  nlp = English()
@@ -10,7 +10,7 @@ from spacy.lang.en import English
10
10
  class TestRuSH(unittest.TestCase):
11
11
 
12
12
  def setUp(self):
13
- pwd = os.path.dirname(os.path.abspath(__file__))
13
+ self.pwd = os.path.dirname(os.path.abspath(__file__))
14
14
 
15
15
  def test_doc(self):
16
16
  nlp = English()
@@ -51,14 +51,14 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
51
51
  End Ezoic - MTSam Sample Bottom Matched Content - native_bottom
52
52
  '''
53
53
  nlp = English()
54
- nlp.add_pipe("medspacy_pyrush")
54
+ nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
55
55
  doc = nlp(input_str)
56
56
  sents = [s for s in doc.sents]
57
57
  for sent in sents:
58
58
  print('>' + str(sent) + '<\n\n')
59
59
 
60
60
  # New expected count includes whitespace-only sentences
61
- assert (len(sents) == 53)
61
+ assert (len(sents) == 51)
62
62
  # For content checks, filter out whitespace-only sentences
63
63
  content_sents = [s for s in sents if s.text.strip()]
64
64
  assert (content_sents[0].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with chest pain and respiratory insufficiency.')
@@ -74,7 +74,7 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
74
74
  from loguru import logger
75
75
  logger.add(sys.stdout, level="DEBUG")
76
76
  nlp = English()
77
- nlp.add_pipe("medspacy_pyrush")
77
+ nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
78
78
  doc = nlp(input_str)
79
79
  sents = [s for s in doc.sents]
80
80
  for sent in sents:
@@ -116,4 +116,5 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
116
116
 
117
117
  # SpaCy has no control of sentence end. Thus, it ends up with sloppy ends.
118
118
  assert (sents[1].text == 'Ms. ABCD is a 69-year-old lady, who was admitted to the hospital with'
119
- ' chest pain and respiratory insufficiency.')
119
+ ' chest pain and respiratory insufficiency.')
120
+
@@ -0,0 +1,45 @@
1
+ import unittest
2
+ import os
3
+ import sys
4
+ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
5
+
6
+ from PyRuSH import PyRuSHSentencizer
7
+ from spacy.lang.en import English
8
+
9
+
10
+ class TestRuSH(unittest.TestCase):
11
+
12
+ def setUp(self):
13
+ self.pwd = os.path.dirname(os.path.abspath(__file__))
14
+
15
+ # def test_doc(self):
16
+ # nlp = English()
17
+ # nlp.add_pipe("medspacy_pyrush")
18
+ # doc = nlp("This is a sentence. This is another sentence.")
19
+ # print('\n'.join([str(s) for s in doc.sents]))
20
+ # print('\nTotal sentences: {}'.format(len([s for s in doc.sents])))
21
+ # print('\ndoc is an instance of {}'.format(type(doc)))
22
+
23
+ def test_doc4(self):
24
+ input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound
25
+ at the time of admission demonstrated pancreatic duct dilitation and
26
+ edematous gallbladder. She was admitted to the ICU.
27
+ Discharge Medications:
28
+ 1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID
29
+ (2 times a day) as needed.
30
+ 2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1)
31
+ Injection TID (3 times a day).
32
+ 3. Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H
33
+ (every 4 to 6 hours) as needed.'''
34
+ nlp = English()
35
+ nlp.add_pipe("medspacy_pyrush", config={"rules_path": os.path.join(self.pwd, 'rush_rules.tsv')})
36
+ nlp.initialize()
37
+ doc = nlp(input_str)
38
+ sents = [s for s in doc.sents]
39
+ for sent in sents:
40
+ print('>' + str(sent) + '<\n\n')
41
+ assert(sents[-1].text=='''Sig: One (1) PO Q4-6H
42
+ (every 4 to 6 hours) as needed.''')
43
+
44
+ if __name__ == '__main__':
45
+ unittest.main()
@@ -95,7 +95,7 @@ class TestRuSH(unittest.TestCase):
95
95
  sentences = rush.segToSentenceSpans(input_str)
96
96
  self.printDetails(sentences, input_str)
97
97
 
98
- def test_doc2(self):
98
+ def test8(self):
99
99
  input_str = '''
100
100
  9. Advair b.i.d.
101
101
  10. Xopenex q.i.d. and p.r.n.
@@ -109,12 +109,44 @@ I will see her in a month to six weeks. She is to follow up with Dr. X before t
109
109
  sent = sentences[1]
110
110
  assert (input_str[sent.begin:sent.end] == '10. Xopenex q.i.d. and p.r.n.')
111
111
 
112
-
113
- def test_doc11(self):
112
+ def test9(self):
114
113
  input_str=' This is a sentence. This is another sentence.'
115
- sentences=self.rush.segToSentenceSpans(input_str)
116
- for sent in sentences:
117
- print('>' + input_str[sent.begin:sent.end] + '<\n')
114
+ self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
115
+ sentences = self.rush.segToSentenceSpans(input_str)
116
+ self.printDetails(sentences, input_str)
117
+
118
+ def test10(self):
119
+ input_str='''Ms. [**Known patient lastname 2004**] was admitted on [**2573-5-30**]. Ultrasound
120
+ at the time of admission demonstrated pancreatic duct dilitation and
121
+ edematous gallbladder. She was admitted to the ICU.
122
+ Discharge Medications:
123
+ 1. Miconazole Nitrate 2 % Powder Sig: One (1) Appl Topical BID
124
+ (2 times a day) as needed.
125
+ 2. Heparin Sodium (Porcine) 5,000 unit/mL Solution Sig: One (1)
126
+ Injection TID (3 times a day).
127
+ 3. Acetaminophen 160 mg/5 mL Elixir Sig: One (1) PO Q4-6H
128
+ (every 4 to 6 hours) as needed.'''
129
+ self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
130
+ sentences = self.rush.segToSentenceSpans(input_str)
131
+ self.printDetails(sentences, input_str)
132
+ assert (sentences[0].begin == 0 and sentences[0].end == 173)
133
+ assert (sentences[1].begin == 174 and sentences[1].end == 202)
134
+ assert (sentences[2].begin == 203 and sentences[2].end == 225)
135
+ assert (sentences[3].begin == 226 and sentences[3].end == 258)
136
+ assert (sentences[4].begin == 259 and sentences[4].end == 316)
137
+ assert (sentences[5].begin == 317 and sentences[5].end == 367)
138
+ assert (sentences[6].begin == 368 and sentences[6].end == 411)
139
+ assert (sentences[7].begin == 412 and sentences[7].end == 447)
140
+ assert (sentences[8].begin == 448 and sentences[8].end == 502)
141
+
142
+ def test11(self):
143
+ input_str = '''Patient doesn't have heart disease or high blood pressure, but their dad did have
144
+ diabetes. Pt is a 63M w/ h/o metastatic carcinoid tumor, HTN and hyperlipidemia.'''
145
+ self.rush = RuSH(str(os.path.join(self.pwd, 'rush_rules.tsv')), min_sent_chars=2, enable_logger=True)
146
+ sentences = self.rush.segToSentenceSpans(input_str)
147
+ self.printDetails(sentences, input_str)
148
+ assert (sentences[0].begin == 0 and sentences[0].end == 91)
149
+ assert (sentences[1].begin == 92 and sentences[1].end == 162)
118
150
 
119
151
  if __name__ == '__main__':
120
152
  unittest.main()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes