PyPI - fast-sentence-segment - Versions diffs - 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl - Mend

fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (30) hide show

fast_sentence_segment/__init__.py +18 -18
fast_sentence_segment/bp/__init__.py +1 -1
fast_sentence_segment/bp/segmenter.py +65 -68
fast_sentence_segment/cli.py +56 -0
fast_sentence_segment/core/__init__.py +4 -0
fast_sentence_segment/core/base_object.py +18 -0
fast_sentence_segment/core/stopwatch.py +38 -0
fast_sentence_segment/dmo/__init__.py +10 -6
fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
fast_sentence_segment/dmo/abbreviations.py +96 -0
fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -53
fast_sentence_segment/dmo/post_process_sentences.py +48 -48
fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
fast_sentence_segment/dmo/title_name_merger.py +152 -0
fast_sentence_segment/svc/__init__.py +2 -2
fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
fast_sentence_segment/svc/perform_sentence_segmentation.py +165 -129
fast_sentence_segment-1.2.0.dist-info/METADATA +189 -0
fast_sentence_segment-1.2.0.dist-info/RECORD +27 -0
{fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.2.0.dist-info}/WHEEL +1 -1
fast_sentence_segment-1.2.0.dist-info/entry_points.txt +3 -0
fast_sentence_segment-1.2.0.dist-info/licenses/LICENSE +21 -0
fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16

fast_sentence_segment/dmo/bullet_point_cleaner.py CHANGED Viewed

@@ -1,55 +1,55 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-""" Prevent Bullet Points from Triggering False Positive Segmentation """
-from baseblock import BaseObject
-class BulletPointCleaner(BaseObject):
-    """ Prevent Bullet Points from Triggering False Positive Segmentation """
-    def __init__(self):
-        """ Change Log
-        Created:
-            30-Sept-2021
-            craigtrim@gmail.com
-        Updated:
-            19-Oct-2022
-            craigtrim@gmail.com
-            *   clean up for segment_text_3_test.py
-        """
-        BaseObject.__init__(self, __name__)
-    @staticmethod
-    def process(input_text: str) -> str:
-        """
-        Purpose:
-            prevent numbered bullet points from triggering sentence detection
-        :param input_text:
-            any input text
-        :return:
-            preprocessed input text
-        """
-        if input_text.startswith("-"):
-            input_text = input_text[1:]  # segment_text_3_test.py
-        if "  " in input_text:
-            input_text = input_text.replace("  ", " ")
-        # the replacement routine above leaves double '..' in the text
-        # this replacement will solve that
-        while ".." in input_text:
-            input_text = input_text.replace("..", ".")
-        while ". -" in input_text:  # segment_text_3_test.py
-            input_text = input_text.replace(". -", ". ")
-        while ". . " in input_text:
-            input_text = input_text.replace(". . ", ".")
-        while '  ' in input_text:
-            input_text = input_text.replace('  ', ' ')
-        return input_text
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Prevent Bullet Points from Triggering False Positive Segmentation """
+from fast_sentence_segment.core import BaseObject
+class BulletPointCleaner(BaseObject):
+    """ Tidy bullet-style text so list markers do not trigger false sentence breaks """
+    def __init__(self):
+        """ Change Log
+        Created:
+            30-Sept-2021
+            craigtrim@gmail.com
+        Updated:
+            19-Oct-2022
+            craigtrim@gmail.com
+            *   clean up for segment_text_3_test.py
+        """
+        BaseObject.__init__(self, __name__)
+    @staticmethod
+    def process(input_text: str) -> str:
+        """
+        Purpose:
+            drop a leading dash, then collapse repeated periods, ". -" bullet
+            markers, ". . " sequences and runs of spaces
+        :param input_text:
+            any input text
+        :return:
+            the cleaned text
+        """
+        # a leading dash is a bullet marker (segment_text_3_test.py)
+        cleaned = input_text[1:] if input_text.startswith("-") else input_text
+        # a single, non-repeating pass over double spaces comes first; the
+        # looping rewrites below see its output, so the order is significant
+        cleaned = cleaned.replace("  ", " ")
+        # each rewrite is applied until its pattern no longer occurs, in this order
+        rewrites = (
+            ("..", "."),
+            (". -", ". "),  # segment_text_3_test.py
+            (". . ", "."),
+            ("  ", " "),
+        )
+        for needle, substitute in rewrites:
+            while needle in cleaned:
+                cleaned = cleaned.replace(needle, substitute)
+        return cleaned

fast_sentence_segment/dmo/ellipsis_normalizer.py ADDED Viewed

@@ -0,0 +1,45 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Normalize Ellipses to prevent them being stripped by cleanup routines """
+import re
+from fast_sentence_segment.core import BaseObject
+PLACEHOLDER = "〈ELLIPSIS〉"
+# Matches "..." followed by whitespace and an ASCII capital letter
+BOUNDARY_PATTERN = re.compile(r'\.\.\.(\s+)([A-Z])')
+class EllipsisNormalizer(BaseObject):
+    """ Swap ellipses for a placeholder token, and swap them back again """
+    def __init__(self):
+        """
+        Created:
+            27-Dec-2024
+            craigtrim@gmail.com
+            *   preserve ellipses through the pipeline
+                https://github.com/craigtrim/fast-sentence-segment/issues/3
+        """
+        BaseObject.__init__(self, __name__)
+    def process(self,
+                input_text: str,
+                denormalize: bool = False) -> str:
+        """
+        Purpose:
+            replace every "..." with PLACEHOLDER, or reverse that replacement
+        :param input_text:
+            any input text
+        :param denormalize:
+            False (default) inserts placeholders; True restores "..."
+        :return:
+            the transformed text
+        """
+        if denormalize:
+            # "〈ELLIPSIS〉." → "..." first, so the period that normalization
+            # appended at a boundary is removed together with the placeholder
+            restored = input_text.replace(PLACEHOLDER + ".", "...")
+            # whatever placeholders remain were mid-sentence ellipses
+            return restored.replace(PLACEHOLDER, "...")
+        # "... [Capital]" → "〈ELLIPSIS〉. [Capital]" (adds period for spaCy to split)
+        marked = BOUNDARY_PATTERN.sub(
+            lambda hit: PLACEHOLDER + "." + hit.group(1) + hit.group(2),
+            input_text)
+        # remaining (mid-sentence) ellipses: "..." → "〈ELLIPSIS〉"
+        return marked.replace("...", PLACEHOLDER)

fast_sentence_segment/dmo/newlines_to_periods.py CHANGED Viewed

@@ -1,57 +1,57 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-""" Convert New Lines into Periods """
-from baseblock import BaseObject
-class NewlinesToPeriods(BaseObject):
-    """ Convert New Lines into Periods """
-    def __init__(self):
-        """
-        Created:
-            30-Sept-2021
-        """
-        BaseObject.__init__(self, __name__)
-    @staticmethod
-    def process(input_text: str):
-        """
-        Purpose:
-            Take a CSV list and transform to sentences
-        :param input_text:
-        :return:
-        """
-        # def replace(input_text: str,
-        #             variant: str,
-        #             canon: str) -> str:
-        #     v1 = f" {variant} "
-        #     if v1 in input_text:
-        #         return input_text.replace(
-        #             v1, f" {canon} ")
-        #     v2 = f"{variant} "
-        #     if v2 in input_text:
-        #         return input_text.replace(
-        #             v2, f"{canon} ")
-        #     v3 = f" {variant}"
-        #     if v3 in input_text:
-        #         return input_text.replace(
-        #             v3, f" {canon}")
-        #     return input_text
-        # result = replace(input_text=input_text,
-        #                  variant='\n',
-        #                  canon=' . ')
-        # 20230309; don't replace a newline with a period
-        #           that too often causes confusion and puts a period where one should not exist
-        result = input_text.replace('\n', ' ')
-        return result
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Convert New Lines into Periods """
+from fast_sentence_segment.core import BaseObject
+class NewlinesToPeriods(BaseObject):
+    """ Flatten multi-line text by turning each newline into a space """
+    def __init__(self):
+        """
+        Created:
+            30-Sept-2021
+        """
+        BaseObject.__init__(self, __name__)
+    @staticmethod
+    def process(input_text: str):
+        """
+        Purpose:
+            replace every newline character in the text with a single space
+        :param input_text:
+            text that may contain newline characters
+        :return:
+            the text with each newline replaced by a space
+        """
+        # 20230309; newlines are no longer turned into ' . ' because that too
+        #           often causes confusion and puts a period where one should not exist
+        return ' '.join(input_text.split('\n'))

fast_sentence_segment/dmo/numbered_list_normalizer.py CHANGED Viewed

@@ -1,53 +1,47 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-""" Normalize Numbered Lists to prevent False Positive Segmentation """
-from baseblock import BaseObject
-class NumberedListNormalizer(BaseObject):
-    """ Normalize Numbered Lists to prevent False Positive Segmentation """
-    __d_candidate_list_elements = {
-        "1. ": "1_ ",
-        "2. ": "2_ ",
-        "3. ": "3_ ",
-        "4. ": "4_ ",
-        "5. ": "5_ ",
-        "6. ": "6_ ",
-        "7. ": "7_ ",
-        "8. ": "8_ ",
-        "9. ": "9_ ",
-        "10. ": "10_ ",
-    }
-    def __init__(self):
-        """
-        Created:
-            19-Oct-2022
-            craigtrim@gmail.com
-            *   https://github.com/craigtrim/fast-sentence-segment/issues/1
-        """
-        BaseObject.__init__(self, __name__)
-    def process(self,
-                input_text: str,
-                denormalize: bool = False) -> str:
-        if not denormalize:
-            for candidate in self.__d_candidate_list_elements:
-                if candidate in input_text:
-                    input_text = input_text.replace(
-                        candidate, self.__d_candidate_list_elements[candidate])
-        else:  # reverse the process
-            d_rev = {self.__d_candidate_list_elements[k]: k
-                     for k in self.__d_candidate_list_elements}
-            for candidate in d_rev:
-                if candidate in input_text:
-                    input_text = input_text.replace(
-                        candidate, d_rev[candidate])
-        return input_text
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Normalize Numbered Lists to prevent False Positive Segmentation """
+import re
+from fast_sentence_segment.core import BaseObject
+class NumberedListNormalizer(BaseObject):
+    """ Mask the period of numbered-list markers ("1. " ↔ "1_ ") and unmask it again """
+    # Line-start markers: start of string, or a newline plus optional whitespace,
+    # followed by a one- or two-digit number, the marker character and a space
+    __normalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})\. ')
+    __denormalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})_ ')
+    # Inline markers: ". N. " (period + space + number + marker character + space)
+    __normalize_inline = re.compile(r'(\. )(\d{1,2})\. ')
+    __denormalize_inline = re.compile(r'(\. )(\d{1,2})_ ')
+    def __init__(self):
+        """
+        Created:
+            19-Oct-2022
+            craigtrim@gmail.com
+            *   https://github.com/craigtrim/fast-sentence-segment/issues/1
+        Updated:
+            27-Dec-2024
+            craigtrim@gmail.com
+            *   fix to only match at line starts, not mid-sentence
+                https://github.com/craigtrim/fast-sentence-segment/issues/3
+        """
+        BaseObject.__init__(self, __name__)
+    def process(self,
+                input_text: str,
+                denormalize: bool = False) -> str:
+        """
+        Purpose:
+            rewrite list markers "N. " as "N_ ", or reverse that rewrite
+        :param input_text:
+            any input text
+        :param denormalize:
+            False (default) masks the period; True restores it
+        :return:
+            the transformed text
+        """
+        if denormalize:
+            patterns = (self.__denormalize_line_start, self.__denormalize_inline)
+            template = r'\1\2. '
+        else:
+            patterns = (self.__normalize_line_start, self.__normalize_inline)
+            template = r'\1\2_ '
+        # line-start markers are handled before inline ones in both directions
+        for pattern in patterns:
+            input_text = pattern.sub(template, input_text)
+        return input_text

fast_sentence_segment/dmo/post_process_sentences.py CHANGED Viewed

@@ -1,48 +1,48 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-""" Post Process Sentences """
-from baseblock import BaseObject
-class PostProcessStructure(BaseObject):
-    """ Post Process Sentences """
-    __replace = {
-        '..': '. ',
-        '. .': '. ',
-        ',.': ', ',
-        ', .': ', ',
-        '!.': '! ',
-        '! .': '! ',
-        '?.': '? ',
-        '? .': '? ',
-        ':.': ': ',
-        ': .': ': ',
-    }
-    def __init__(self):
-        """
-        Created:
-            1-Oct-2021
-        """
-        BaseObject.__init__(self, __name__)
-    def process(self,
-                sentences: list) -> list:
-        normalized = []
-        for sentence in sentences:
-            for k in self.__replace:
-                if k in sentence:
-                    sentence = sentence.replace(k, self.__replace[k]).strip()
-            normalized.append(sentence)
-        return normalized
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Post Process Sentences """
+from fast_sentence_segment.core import BaseObject
+class PostProcessStructure(BaseObject):
+    """ Clean up doubled or stray punctuation in segmented sentences """
+    # applied in insertion order; each key is replaced by its value
+    __replace = {
+        '..': '. ',
+        '. .': '. ',
+        ',.': ', ',
+        ', .': ', ',
+        '!.': '! ',
+        '! .': '! ',
+        '?.': '? ',
+        '? .': '? ',
+        ':.': ': ',
+        ': .': ': ',
+    }
+    def __init__(self):
+        """
+        Created:
+            1-Oct-2021
+        """
+        BaseObject.__init__(self, __name__)
+    def process(self,
+                sentences: list) -> list:
+        """
+        Purpose:
+            apply the punctuation replacements to every sentence
+        :param sentences:
+            the sentences to clean
+        :return:
+            a new list holding the cleaned sentences in the same order
+        """
+        def tidy(text):
+            for needle, substitute in self.__replace.items():
+                # the sentence is stripped only when a replacement happened,
+                # so a sentence without any match is returned unchanged
+                if needle in text:
+                    text = text.replace(needle, substitute).strip()
+            return text
+        return [tidy(sentence) for sentence in sentences]

fast_sentence_segment/dmo/question_exclamation_splitter.py ADDED Viewed

@@ -0,0 +1,59 @@
+#!/usr/bin/env python
+# -*- coding: UTF-8 -*-
+""" Split sentences at ? and ! followed by capital letter """
+import re
+from typing import List
+from fast_sentence_segment.core import BaseObject
+# Pattern: ? or ! followed by whitespace and an ASCII capital letter
+BOUNDARY_PATTERN = re.compile(r'([?!])(\s+)([A-Z])')
+class QuestionExclamationSplitter(BaseObject):
+    """ Split sentences at ? and ! followed by capital letter """
+    def __init__(self):
+        """
+        Created:
+            27-Dec-2024
+            craigtrim@gmail.com
+            *   spaCy doesn't always split on ? and ! boundaries
+                https://github.com/craigtrim/fast-sentence-segment/issues/3
+        """
+        BaseObject.__init__(self, __name__)
+    def process(self, sentences: List[str]) -> List[str]:
+        """Split sentences that contain ? or ! followed by capital letter.
+        The punctuation mark stays with the text before the boundary, the
+        capital letter starts the next piece, and the whitespace between
+        them is dropped.
+        Args:
+            sentences: List of sentences from earlier processing
+        Returns:
+            List of sentences with ? and ! boundaries split; every item is
+            stripped and empty items are omitted
+        """
+        result = []
+        for sent in sentences:
+            # With three capture groups, split() returns
+            # [text, punct, space, capital, text, ..., text]: four items per
+            # boundary plus a final text item (just [sent] when nothing matches).
+            parts = BOUNDARY_PATTERN.split(sent)
+            carried = ""  # capital letter that opens the next piece
+            for start in range(0, len(parts) - 1, 4):
+                # text before the boundary + its punctuation mark
+                result.append(carried + parts[start] + parts[start + 1])
+                carried = parts[start + 3]
+            # trailing text after the last boundary (or the whole sentence)
+            result.append(carried + parts[-1])
+        stripped = (piece.strip() for piece in result)
+        return [piece for piece in stripped if piece]

fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl

fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl