fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30)
  1. fast_sentence_segment/__init__.py +18 -18
  2. fast_sentence_segment/bp/__init__.py +1 -1
  3. fast_sentence_segment/bp/segmenter.py +65 -68
  4. fast_sentence_segment/cli.py +56 -0
  5. fast_sentence_segment/core/__init__.py +4 -0
  6. fast_sentence_segment/core/base_object.py +18 -0
  7. fast_sentence_segment/core/stopwatch.py +38 -0
  8. fast_sentence_segment/dmo/__init__.py +10 -6
  9. fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
  10. fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
  11. fast_sentence_segment/dmo/abbreviations.py +96 -0
  12. fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
  13. fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
  14. fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
  15. fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -53
  16. fast_sentence_segment/dmo/post_process_sentences.py +48 -48
  17. fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
  18. fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
  19. fast_sentence_segment/dmo/title_name_merger.py +152 -0
  20. fast_sentence_segment/svc/__init__.py +2 -2
  21. fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
  22. fast_sentence_segment/svc/perform_sentence_segmentation.py +165 -129
  23. fast_sentence_segment-1.2.0.dist-info/METADATA +189 -0
  24. fast_sentence_segment-1.2.0.dist-info/RECORD +27 -0
  25. {fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.2.0.dist-info}/WHEEL +1 -1
  26. fast_sentence_segment-1.2.0.dist-info/entry_points.txt +3 -0
  27. fast_sentence_segment-1.2.0.dist-info/licenses/LICENSE +21 -0
  28. fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
  29. fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
  30. fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16
@@ -1,55 +1,55 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Prevent Bullet Points from Triggering False Positive Segmentation """
4
-
5
-
6
- from baseblock import BaseObject
7
-
8
-
9
- class BulletPointCleaner(BaseObject):
10
- """ Prevent Bullet Points from Triggering False Positive Segmentation """
11
-
12
- def __init__(self):
13
- """ Change Log
14
-
15
- Created:
16
- 30-Sept-2021
17
- craigtrim@gmail.com
18
- Updated:
19
- 19-Oct-2022
20
- craigtrim@gmail.com
21
- * clean up for segment_text_3_test.py
22
- """
23
- BaseObject.__init__(self, __name__)
24
-
25
- @staticmethod
26
- def process(input_text: str) -> str:
27
- """
28
- Purpose:
29
- prevent numbered bullet points from triggering sentence detection
30
- :param input_text:
31
- any input text
32
- :return:
33
- preprocessed input text
34
- """
35
- if input_text.startswith("-"):
36
- input_text = input_text[1:] # segment_text_3_test.py
37
-
38
- if " " in input_text:
39
- input_text = input_text.replace(" ", " ")
40
-
41
- # the replacement routine above leaves double '..' in the text
42
- # this replacement will solve that
43
- while ".." in input_text:
44
- input_text = input_text.replace("..", ".")
45
-
46
- while ". -" in input_text: # segment_text_3_test.py
47
- input_text = input_text.replace(". -", ". ")
48
-
49
- while ". . " in input_text:
50
- input_text = input_text.replace(". . ", ".")
51
-
52
- while ' ' in input_text:
53
- input_text = input_text.replace(' ', ' ')
54
-
55
- return input_text
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Prevent Bullet Points from Triggering False Positive Segmentation """
4
+
5
+
6
+ from fast_sentence_segment.core import BaseObject
7
+
8
+
9
class BulletPointCleaner(BaseObject):
    """ Prevent Bullet Points from Triggering False Positive Segmentation """

    def __init__(self):
        """ Change Log

        Created:
            30-Sept-2021
            craigtrim@gmail.com
        Updated:
            19-Oct-2022
            craigtrim@gmail.com
            *   clean up for segment_text_3_test.py
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str) -> str:
        """
        Purpose:
            Strip a leading dash bullet and collapse the duplicated periods,
            ". -" bullet markers and repeated spaces that would otherwise
            look like sentence boundaries.
        :param input_text:
            any input text
        :return:
            preprocessed input text
        """
        # drop a single leading dash bullet (segment_text_3_test.py)
        if input_text.startswith("-"):
            input_text = input_text[1:]

        # NOTE(review): the two-space literals below were reconstructed; the
        # rendered text showed single spaces, which made this branch a no-op
        # and the final loop non-terminating. Confirm against the package.
        if "  " in input_text:
            input_text = input_text.replace("  ", " ")

        # the replacement routine above leaves double '..' in the text
        # this replacement will solve that
        while ".." in input_text:
            input_text = input_text.replace("..", ".")

        # a dash bullet directly after a sentence end (segment_text_3_test.py)
        while ". -" in input_text:
            input_text = input_text.replace(". -", ". ")

        while ". . " in input_text:
            input_text = input_text.replace(". . ", ".")

        # collapse every remaining run of spaces down to a single space;
        # each pass shortens the text, so the loop always terminates
        while "  " in input_text:
            input_text = input_text.replace("  ", " ")

        return input_text
@@ -0,0 +1,45 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Normalize Ellipses to prevent them being stripped by cleanup routines """
4
+
5
+
6
+ import re
7
+
8
+ from fast_sentence_segment.core import BaseObject
9
+
10
+
11
# Token substituted for "..." while the text moves through the pipeline
PLACEHOLDER = "〈ELLIPSIS〉"

# Pattern: ... followed by whitespace and an ASCII capital letter
BOUNDARY_PATTERN = re.compile(r'\.\.\.(\s+)([A-Z])')


class EllipsisNormalizer(BaseObject):
    """ Normalize Ellipses to prevent them being stripped by cleanup routines """

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
            *   preserve ellipses through the pipeline
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                input_text: str,
                denormalize: bool = False) -> str:
        """
        Purpose:
            Swap ellipses for PLACEHOLDER, or swap PLACEHOLDER back to "...".
        :param input_text:
            the text to transform
        :param denormalize:
            False (default) replaces "..." with the placeholder;
            True restores "..." from the placeholder
        :return:
            the transformed text
        """
        if denormalize:
            # "〈ELLIPSIS〉." first, so the period added at a boundary is
            # removed together with the placeholder
            restored = input_text.replace(PLACEHOLDER + ".", "...")
            # whatever placeholders remain were mid-sentence ellipses
            return restored.replace(PLACEHOLDER, "...")

        # "... [Capital]" becomes "〈ELLIPSIS〉. [Capital]"; the added period
        # gives spaCy a boundary to split on
        marked = BOUNDARY_PATTERN.sub(PLACEHOLDER + r'.\1\2', input_text)
        # every other "..." (mid-sentence) becomes the bare placeholder
        return marked.replace("...", PLACEHOLDER)
@@ -1,57 +1,57 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Convert New Lines into Periods """
4
-
5
-
6
- from baseblock import BaseObject
7
-
8
-
9
- class NewlinesToPeriods(BaseObject):
10
- """ Convert New Lines into Periods """
11
-
12
- def __init__(self):
13
- """
14
- Created:
15
- 30-Sept-2021
16
- """
17
- BaseObject.__init__(self, __name__)
18
-
19
- @staticmethod
20
- def process(input_text: str):
21
- """
22
- Purpose:
23
- Take a CSV list and transform to sentences
24
- :param input_text:
25
- :return:
26
- """
27
-
28
- # def replace(input_text: str,
29
- # variant: str,
30
- # canon: str) -> str:
31
-
32
- # v1 = f" {variant} "
33
- # if v1 in input_text:
34
- # return input_text.replace(
35
- # v1, f" {canon} ")
36
-
37
- # v2 = f"{variant} "
38
- # if v2 in input_text:
39
- # return input_text.replace(
40
- # v2, f"{canon} ")
41
-
42
- # v3 = f" {variant}"
43
- # if v3 in input_text:
44
- # return input_text.replace(
45
- # v3, f" {canon}")
46
-
47
- # return input_text
48
-
49
- # result = replace(input_text=input_text,
50
- # variant='\n',
51
- # canon=' . ')
52
-
53
- # 20230309; don't replace a newline with a period
54
- # that too often causes confusion and puts a period where one should not exist
55
- result = input_text.replace('\n', ' ')
56
-
57
- return result
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Convert New Lines into Periods """
4
+
5
+
6
+ from fast_sentence_segment.core import BaseObject
7
+
8
+
9
class NewlinesToPeriods(BaseObject):
    """ Convert New Lines into Periods """

    def __init__(self):
        """
        Created:
            30-Sept-2021
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str) -> str:
        """
        Purpose:
            Replace every newline character with a single space.
        :param input_text:
            any input text
        :return:
            the text with each newline replaced by a space
        """
        # 20230309; don't replace a newline with a period
        # that too often causes confusion and puts a period where one should not exist
        return input_text.replace('\n', ' ')
@@ -1,53 +1,47 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Normalize Numbered Lists to prevent False Positive Segmentation """
4
-
5
-
6
- from baseblock import BaseObject
7
-
8
-
9
- class NumberedListNormalizer(BaseObject):
10
- """ Normalize Numbered Lists to prevent False Positive Segmentation """
11
-
12
- __d_candidate_list_elements = {
13
- "1. ": "1_ ",
14
- "2. ": "2_ ",
15
- "3. ": "3_ ",
16
- "4. ": "4_ ",
17
- "5. ": "5_ ",
18
- "6. ": "6_ ",
19
- "7. ": "7_ ",
20
- "8. ": "8_ ",
21
- "9. ": "9_ ",
22
- "10. ": "10_ ",
23
- }
24
-
25
- def __init__(self):
26
- """
27
- Created:
28
- 19-Oct-2022
29
- craigtrim@gmail.com
30
- * https://github.com/craigtrim/fast-sentence-segment/issues/1
31
- """
32
- BaseObject.__init__(self, __name__)
33
-
34
- def process(self,
35
- input_text: str,
36
- denormalize: bool = False) -> str:
37
-
38
- if not denormalize:
39
- for candidate in self.__d_candidate_list_elements:
40
- if candidate in input_text:
41
- input_text = input_text.replace(
42
- candidate, self.__d_candidate_list_elements[candidate])
43
-
44
- else: # reverse the process
45
- d_rev = {self.__d_candidate_list_elements[k]: k
46
- for k in self.__d_candidate_list_elements}
47
-
48
- for candidate in d_rev:
49
- if candidate in input_text:
50
- input_text = input_text.replace(
51
- candidate, d_rev[candidate])
52
-
53
- return input_text
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Normalize Numbered Lists to prevent False Positive Segmentation """
4
+
5
+
6
+ import re
7
+
8
+ from fast_sentence_segment.core import BaseObject
9
+
10
+
11
class NumberedListNormalizer(BaseObject):
    """ Normalize Numbered Lists to prevent False Positive Segmentation """

    # Pattern 1: start of string OR newline (plus any whitespace after it),
    # followed by a 1-2 digit number, then ". " (normalize) or "_ " (denormalize)
    __normalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})\. ')
    __denormalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})_ ')

    # Pattern 2: inline numbered list, i.e. ". " followed by a 1-2 digit
    # number, then ". " (normalize) or "_ " (denormalize)
    __normalize_inline = re.compile(r'(\. )(\d{1,2})\. ')
    __denormalize_inline = re.compile(r'(\. )(\d{1,2})_ ')

    def __init__(self):
        """
        Created:
            19-Oct-2022
            craigtrim@gmail.com
            *   https://github.com/craigtrim/fast-sentence-segment/issues/1
        Updated:
            27-Dec-2024
            craigtrim@gmail.com
            *   fix to only match at line starts, not mid-sentence
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                input_text: str,
                denormalize: bool = False) -> str:
        """
        Purpose:
            Rewrite list markers such as "1. " as "1_ ", or reverse that
            rewrite.
        :param input_text:
            the text to transform
        :param denormalize:
            False (default) turns "N. " markers into "N_ ";
            True turns "N_ " markers back into "N. "
        :return:
            the transformed text
        """
        if denormalize:
            steps = (
                (self.__denormalize_line_start, r'\1\2. '),
                (self.__denormalize_inline, r'\1\2. '),
            )
        else:
            steps = (
                (self.__normalize_line_start, r'\1\2_ '),
                (self.__normalize_inline, r'\1\2_ '),
            )

        # line-start markers are always handled before inline ones
        for pattern, replacement in steps:
            input_text = pattern.sub(replacement, input_text)

        return input_text
@@ -1,48 +1,48 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Post Process Sentences """
4
-
5
-
6
- from baseblock import BaseObject
7
-
8
-
9
- class PostProcessStructure(BaseObject):
10
- """ Post Process Sentences """
11
-
12
- __replace = {
13
- '..': '. ',
14
- '. .': '. ',
15
-
16
- ',.': ', ',
17
- ', .': ', ',
18
-
19
- '!.': '! ',
20
- '! .': '! ',
21
-
22
- '?.': '? ',
23
- '? .': '? ',
24
-
25
- ':.': ': ',
26
- ': .': ': ',
27
- }
28
-
29
- def __init__(self):
30
- """
31
- Created:
32
- 1-Oct-2021
33
- """
34
- BaseObject.__init__(self, __name__)
35
-
36
- def process(self,
37
- sentences: list) -> list:
38
- normalized = []
39
-
40
- for sentence in sentences:
41
-
42
- for k in self.__replace:
43
- if k in sentence:
44
- sentence = sentence.replace(k, self.__replace[k]).strip()
45
-
46
- normalized.append(sentence)
47
-
48
- return normalized
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Post Process Sentences """
4
+
5
+
6
+ from fast_sentence_segment.core import BaseObject
7
+
8
+
9
class PostProcessStructure(BaseObject):
    """ Post Process Sentences """

    # punctuation followed by a stray period (with or without a space
    # between them) -> the punctuation plus a single space;
    # entries are applied in the order listed here
    __replace = {
        '..': '. ',
        '. .': '. ',

        ',.': ', ',
        ', .': ', ',

        '!.': '! ',
        '! .': '! ',

        '?.': '? ',
        '? .': '? ',

        ':.': ': ',
        ': .': ': ',
    }

    def __init__(self):
        """
        Created:
            1-Oct-2021
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                sentences: list) -> list:
        """
        Purpose:
            Remove stray periods that follow other punctuation in each
            sentence.
        :param sentences:
            the sentences to clean
        :return:
            a new list holding the cleaned sentences, in the same order
        """

        def tidy(text):
            for needle, substitute in self.__replace.items():
                if needle in text:
                    # the strip only happens when a replacement was made;
                    # untouched sentences keep their surrounding whitespace
                    text = text.replace(needle, substitute).strip()
            return text

        return [tidy(sentence) for sentence in sentences]
@@ -0,0 +1,59 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Split sentences at ? and ! followed by capital letter """
4
+
5
+
6
+ import re
7
+ from typing import List
8
+
9
+ from fast_sentence_segment.core import BaseObject
10
+
11
+
12
# Pattern: ? or ! followed by whitespace and an ASCII capital letter
BOUNDARY_PATTERN = re.compile(r'([?!])(\s+)([A-Z])')


class QuestionExclamationSplitter(BaseObject):
    """ Split sentences at ? and ! followed by capital letter """

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
            *   spaCy doesn't always split on ? and ! boundaries
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self, sentences: List[str]) -> List[str]:
        """Split sentences that contain ? or ! followed by capital letter.

        Args:
            sentences: List of sentences from earlier processing

        Returns:
            List of stripped, non-empty sentences in which every "?" or "!"
            followed by whitespace and a capital letter ends its own entry
        """
        pieces: List[str] = []
        for sentence in sentences:
            cursor = 0
            for boundary in BOUNDARY_PATTERN.finditer(sentence):
                # the punctuation (group 1) stays with the text before it
                pieces.append(sentence[cursor:boundary.end(1)])
                # the capital letter (group 3) opens the next piece; the
                # whitespace in between is dropped
                cursor = boundary.start(3)
            # the tail after the last boundary, or the whole sentence when
            # there was no boundary at all
            pieces.append(sentence[cursor:])

        return [piece.strip() for piece in pieces if piece.strip()]