PyRuSH 1.0.10.dev0__tar.gz → 1.0.12.dev0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PKG-INFO +15 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/PyRuSHSentencizer.py +55 -16
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/RuSH.py +10 -46
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/StaticSentencizerFun.cpp +5687 -1536
- pyrush-1.0.12.dev0/PyRuSH/StaticSentencizerFun.pyx +215 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH/__init__.py +1 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/PKG-INFO +15 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/SOURCES.txt +6 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/requires.txt +1 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/README.rst +13 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/conf/rush_rules.tsv +1 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/pyproject.toml +0 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/requirements.txt +2 -1
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.cfg +0 -1
- pyrush-1.0.12.dev0/tests/test_PyRuSHSentencizer_param.py +70 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_PyRushSentencizer.py +12 -10
- pyrush-1.0.12.dev0/tests/test_PyRushSentencizer2.py +45 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush.py +38 -6
- pyrush-1.0.12.dev0/tests/test_cpredict_split_gaps.py +198 -0
- pyrush-1.0.12.dev0/tests/test_debug.py +35 -0
- pyrush-1.0.12.dev0/tests/test_merge_gaps_max_length.py +119 -0
- pyrush-1.0.10.dev0/PyRuSH/StaticSentencizerFun.pyx +0 -125
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/LICENSE +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/MANIFEST.in +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/dependency_links.txt +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/not-zip-safe +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/PyRuSH.egg-info/top_level.txt +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/setup.py +0 -0
- {pyrush-1.0.10.dev0 → pyrush-1.0.12.dev0}/tests/test_Rush_w_Logger.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: PyRuSH
|
|
3
|
-
Version: 1.0.
|
|
3
|
+
Version: 1.0.12.dev0
|
|
4
4
|
Summary: PyRuSH is the python implementation of RuSH (Rule-based sentence Segmenter using Hashing), which is originally developed using Java. RuSH is an efficient, reliable, and easily adaptable rule-based sentence segmentation solution. It is specifically designed to handle telegraphic written text in clinical notes. It leverages a nested hash table to execute simultaneous rule processing, which reduces the impact of the rule-base growth on execution time and eliminates the effect of rule order on accuracy.
|
|
5
5
|
Home-page: https://github.com/jianlins/PyRuSH
|
|
6
6
|
Author: Jianlin
|
|
@@ -18,6 +18,7 @@ Requires-Dist: spacy<3.8; python_version < "3.12"
|
|
|
18
18
|
Requires-Dist: spacy>=3.8; python_version >= "3.12"
|
|
19
19
|
Requires-Dist: PyFastNER>=1.0.8
|
|
20
20
|
Requires-Dist: quicksectx>=0.3.5
|
|
21
|
+
Requires-Dist: loguru
|
|
21
22
|
Dynamic: author
|
|
22
23
|
Dynamic: home-page
|
|
23
24
|
Dynamic: license-file
|
|
@@ -76,3 +77,16 @@ Start from version 1.0.3, PyRuSH adds Spacy compatible Sentencizer component: Py
|
|
|
76
77
|
A Colab Notebook Demo
|
|
77
78
|
---------------------------
|
|
78
79
|
Feel free to try this runnable `Colab notebook Demo <https://colab.research.google.com/drive/1gX9MzZTQiPw8G3x_vUwZbiSXGtbI0uIX?usp=sharing>`_
|
|
80
|
+
|
|
81
|
+
Revision History
|
|
82
|
+
----------------
|
|
83
|
+
|
|
84
|
+
**1.0.11 (2025-09-02)**
|
|
85
|
+
|
|
86
|
+
- Improved sentence splitting logic: Sentences are now split at the last token before exceeding the max length, ensuring no chunk exceeds the specified limit.
|
|
87
|
+
- Edge case handling: Trailing whitespaces (caused by the spaCy sentence labeling mechanism) can be optionally split into a separate sentence (merge_gaps=False) to avoid unnecessarily long sentences.
|
|
88
|
+
|
|
89
|
+
**1.0.9 (2024-10-27)**
|
|
90
|
+
|
|
91
|
+
- Initial release with spaCy 3.x compatibility and core RuSH logic.
|
|
92
|
+
- Added Spacy-compatible PyRuSHSentencizer component.
|
|
@@ -25,16 +25,22 @@ from .StaticSentencizerFun import cpredict_merge_gaps,cpredict_split_gaps, cset_
|
|
|
25
25
|
@Language.factory("medspacy_pyrush")
|
|
26
26
|
class PyRuSHSentencizer(Sentencizer):
|
|
27
27
|
def __init__(self, nlp: Language, name: str = "medspacy_pyrush", rules_path: str = '', max_repeat: int = 50,
|
|
28
|
-
auto_fix_gaps: bool = True, merge_gaps: bool = False) -> Sentencizer:
|
|
28
|
+
auto_fix_gaps: bool = True, merge_gaps: bool = False, max_sentence_length: int = None) -> Sentencizer:
|
|
29
29
|
"""
|
|
30
|
+
Initialize the PyRuSH sentencizer component.
|
|
30
31
|
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
32
|
+
Args:
|
|
33
|
+
nlp (Language): The spaCy language pipeline.
|
|
34
|
+
name (str): Name of the component. Default is "medspacy_pyrush".
|
|
35
|
+
rules_path (str): Path to the rule file or rules themselves. If empty, defaults to 'conf/rush_rules.tsv'.
|
|
36
|
+
max_repeat (int): Maximum number of repeats allowed for the '+' wildcard in rules.
|
|
37
|
+
auto_fix_gaps (bool): If True, attempts to fix gaps caused by malformed rules.
|
|
38
|
+
merge_gaps (bool): If True, merges gaps between sentences into the preceding sentence. If False, splits gaps (might be multiple whitespaces or new line characters) into separate sentences.
|
|
39
|
+
max_sentence_length (int or None): Maximum allowed sentence length in characters. If set, sentences longer than this will be split.
|
|
40
|
+
|
|
41
|
+
Notes:
|
|
42
|
+
- Setting merge_gaps controls whether gaps are merged or split.
|
|
43
|
+
- max_sentence_length applies to both merge and split modes.
|
|
38
44
|
"""
|
|
39
45
|
self.nlp = nlp
|
|
40
46
|
self.name = name
|
|
@@ -45,32 +51,65 @@ class PyRuSHSentencizer(Sentencizer):
|
|
|
45
51
|
self.rules_path = rules_path
|
|
46
52
|
self.rush = RuSH(rules=rules_path, max_repeat=max_repeat, auto_fix_gaps=auto_fix_gaps)
|
|
47
53
|
self.merge_gaps = merge_gaps
|
|
54
|
+
self.max_sentence_length = max_sentence_length
|
|
48
55
|
|
|
49
56
|
@classmethod
|
|
50
57
|
def from_nlp(cls, nlp, **cfg):
|
|
58
|
+
"""
|
|
59
|
+
Create a PyRuSHSentencizer instance from a spaCy nlp object and configuration.
|
|
60
|
+
|
|
61
|
+
Args:
|
|
62
|
+
nlp (Language): The spaCy language pipeline.
|
|
63
|
+
**cfg: Additional configuration parameters for initialization.
|
|
64
|
+
|
|
65
|
+
Returns:
|
|
66
|
+
PyRuSHSentencizer: An initialized sentencizer instance.
|
|
67
|
+
"""
|
|
51
68
|
return cls(**cfg)
|
|
52
69
|
|
|
53
70
|
def __call__(self, doc):
|
|
71
|
+
"""
|
|
72
|
+
Apply sentence boundary detection to a spaCy Doc and set sentence start annotations.
|
|
73
|
+
|
|
74
|
+
Args:
|
|
75
|
+
doc (Doc): The spaCy Doc to process.
|
|
76
|
+
|
|
77
|
+
Returns:
|
|
78
|
+
Doc: The processed Doc with sentence boundaries set.
|
|
79
|
+
"""
|
|
54
80
|
tags = self.predict([doc])
|
|
55
81
|
cset_annotations([doc], tags)
|
|
56
82
|
return doc
|
|
57
83
|
|
|
58
84
|
def predict(self, docs):
|
|
59
|
-
"""
|
|
60
|
-
|
|
85
|
+
"""
|
|
86
|
+
Predict sentence boundaries for a batch of spaCy Docs.
|
|
87
|
+
|
|
88
|
+
Args:
|
|
89
|
+
docs (list of Doc): List of spaCy Docs to process.
|
|
90
|
+
|
|
91
|
+
Returns:
|
|
92
|
+
list of list of bool: Sentence start guesses for each Doc.
|
|
93
|
+
|
|
94
|
+
Notes:
|
|
95
|
+
- Does not modify the Docs; only returns sentence start predictions.
|
|
61
96
|
"""
|
|
62
97
|
if self.merge_gaps:
|
|
63
|
-
|
|
64
|
-
guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans)
|
|
98
|
+
guesses = cpredict_merge_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
|
|
65
99
|
else:
|
|
66
|
-
guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans)
|
|
100
|
+
guesses = cpredict_split_gaps(docs, self.rush.segToSentenceSpans, self.max_sentence_length)
|
|
67
101
|
return guesses
|
|
68
102
|
|
|
69
103
|
def set_annotations(self, docs, batch_tag_ids, tensors=None):
|
|
70
104
|
"""
|
|
71
|
-
|
|
105
|
+
Set sentence boundary annotations on spaCy Docs.
|
|
106
|
+
|
|
107
|
+
Args:
|
|
108
|
+
docs (list of Doc): List of spaCy Docs to annotate.
|
|
109
|
+
batch_tag_ids (list of list of bool): Sentence start tags for each Doc.
|
|
110
|
+
tensors: Placeholder for future extensions (optional).
|
|
72
111
|
|
|
73
|
-
|
|
74
|
-
|
|
112
|
+
Notes:
|
|
113
|
+
- This method overwrites spaCy's Sentencizer annotations.
|
|
75
114
|
"""
|
|
76
115
|
cset_annotations(docs, batch_tag_ids, tensors)
|
|
@@ -27,8 +27,7 @@
|
|
|
27
27
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
28
28
|
# See the License for the specific language governing permissions and
|
|
29
29
|
# limitations under the License.
|
|
30
|
-
import
|
|
31
|
-
import logging.config
|
|
30
|
+
from loguru import logger
|
|
32
31
|
import os.path
|
|
33
32
|
from typing import Union, List
|
|
34
33
|
|
|
@@ -40,40 +39,7 @@ END = 'stend'
|
|
|
40
39
|
|
|
41
40
|
|
|
42
41
|
def initLogger():
|
|
43
|
-
|
|
44
|
-
'logging.ini']
|
|
45
|
-
config_file = None
|
|
46
|
-
for f in config_files:
|
|
47
|
-
if os.path.isfile(f):
|
|
48
|
-
config_file = f
|
|
49
|
-
break
|
|
50
|
-
if config_file is None:
|
|
51
|
-
config_file = config_files[-1]
|
|
52
|
-
with open(config_file, 'w') as f:
|
|
53
|
-
f.write('''[loggers]
|
|
54
|
-
keys=root
|
|
55
|
-
|
|
56
|
-
[handlers]
|
|
57
|
-
keys=consoleHandler
|
|
58
|
-
|
|
59
|
-
[formatters]
|
|
60
|
-
keys=simpleFormatter
|
|
61
|
-
|
|
62
|
-
[logger_root]
|
|
63
|
-
level=WARNING
|
|
64
|
-
handlers=consoleHandler
|
|
65
|
-
|
|
66
|
-
[handler_consoleHandler]
|
|
67
|
-
class=StreamHandler
|
|
68
|
-
level=WARNING
|
|
69
|
-
formatter=simpleFormatter
|
|
70
|
-
args=(sys.stdout,)
|
|
71
|
-
|
|
72
|
-
[formatter_simpleFormatter]
|
|
73
|
-
format=%(asctime)s - %(name)s - %(levelname)s - %(message)s
|
|
74
|
-
datefmt=
|
|
75
|
-
''')
|
|
76
|
-
logging.config.fileConfig(config_file)
|
|
42
|
+
pass # Removed: logging config logic for Loguru migration
|
|
77
43
|
|
|
78
44
|
|
|
79
45
|
class RuSH:
|
|
@@ -84,9 +50,7 @@ class RuSH:
|
|
|
84
50
|
self.fastner = FastCNER(rules, max_repeat)
|
|
85
51
|
self.fastner.span_compare_method = 'scorewidth'
|
|
86
52
|
if enable_logger:
|
|
87
|
-
|
|
88
|
-
self.logger = logging.getLogger(__name__)
|
|
89
|
-
print(self.logger.level)
|
|
53
|
+
self.logger = logger
|
|
90
54
|
else:
|
|
91
55
|
self.logger = None
|
|
92
56
|
self.auto_fix_gaps = auto_fix_gaps
|
|
@@ -109,13 +73,13 @@ class RuSH:
|
|
|
109
73
|
self.fastner.process(text, 0, result)
|
|
110
74
|
|
|
111
75
|
# log important message for debugging use
|
|
112
|
-
if self.logger is not None
|
|
76
|
+
if self.logger is not None:
|
|
113
77
|
text = text.replace('\n', ' ')
|
|
114
78
|
for concept_type, spans in result.items():
|
|
115
|
-
self.logger.debug(concept_type)
|
|
79
|
+
self.logger.opt(lazy=True).debug(concept_type)
|
|
116
80
|
for span in spans:
|
|
117
81
|
rule = self.fastner.rule_store[span.rule_id]
|
|
118
|
-
self.logger.debug(
|
|
82
|
+
self.logger.opt(lazy=True).debug(
|
|
119
83
|
'\t{0}-{1}:{2}\t{3}<{4}>\t[Rule {5}:\t{6}\t{7}\t{8}\t{9}]'.format(span.begin, span.end,
|
|
120
84
|
span.score,
|
|
121
85
|
text[:span.begin],
|
|
@@ -185,15 +149,15 @@ class RuSH:
|
|
|
185
149
|
if trimed_gap is not None and trimed_gap.width > self.min_sent_chars:
|
|
186
150
|
output.append(trimed_gap)
|
|
187
151
|
|
|
188
|
-
if self.logger is not None
|
|
152
|
+
if self.logger is not None:
|
|
189
153
|
for sentence in output:
|
|
190
|
-
self.logger.debug(
|
|
154
|
+
self.logger.opt(lazy=True).debug(
|
|
191
155
|
'Sentence({0}-{1}):\t>{2}<'.format(sentence.begin, sentence.end, text[sentence.begin:sentence.end]))
|
|
192
156
|
|
|
193
157
|
return output
|
|
194
158
|
|
|
195
159
|
@staticmethod
|
|
196
|
-
def fix_gap(sentences:
|
|
160
|
+
def fix_gap(sentences: list, text: str, previous_end: int, this_begin: int, min_sent_chars: int = 5):
|
|
197
161
|
trimed_gap = RuSH.trim_gap(text, previous_end, this_begin)
|
|
198
162
|
if trimed_gap is None:
|
|
199
163
|
return
|
|
@@ -203,7 +167,7 @@ class RuSH:
|
|
|
203
167
|
sentences[-1].end = trimed_gap.end
|
|
204
168
|
|
|
205
169
|
@staticmethod
|
|
206
|
-
def trim_gap(text: str, previous_end: int, this_begin: int) -> Span:
|
|
170
|
+
def trim_gap(text: str, previous_end: int, this_begin: int) -> 'Span | None':
|
|
207
171
|
begin = -1
|
|
208
172
|
alnum_begin = -1
|
|
209
173
|
end = 0
|