fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (30):
  1. fast_sentence_segment/__init__.py +18 -18
  2. fast_sentence_segment/bp/__init__.py +1 -1
  3. fast_sentence_segment/bp/segmenter.py +65 -68
  4. fast_sentence_segment/cli.py +56 -0
  5. fast_sentence_segment/core/__init__.py +4 -0
  6. fast_sentence_segment/core/base_object.py +18 -0
  7. fast_sentence_segment/core/stopwatch.py +38 -0
  8. fast_sentence_segment/dmo/__init__.py +10 -6
  9. fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
  10. fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
  11. fast_sentence_segment/dmo/abbreviations.py +96 -0
  12. fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
  13. fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
  14. fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
  15. fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -53
  16. fast_sentence_segment/dmo/post_process_sentences.py +48 -48
  17. fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
  18. fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
  19. fast_sentence_segment/dmo/title_name_merger.py +152 -0
  20. fast_sentence_segment/svc/__init__.py +2 -2
  21. fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
  22. fast_sentence_segment/svc/perform_sentence_segmentation.py +165 -129
  23. fast_sentence_segment-1.2.0.dist-info/METADATA +189 -0
  24. fast_sentence_segment-1.2.0.dist-info/RECORD +27 -0
  25. {fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.2.0.dist-info}/WHEEL +1 -1
  26. fast_sentence_segment-1.2.0.dist-info/entry_points.txt +3 -0
  27. fast_sentence_segment-1.2.0.dist-info/licenses/LICENSE +21 -0
  28. fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
  29. fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
  30. fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16
@@ -1,101 +1,101 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Run Sentence Segmentation using spaCy """
4
-
5
-
6
- from spacy.lang.en import English
7
-
8
-
9
- from baseblock import BaseObject
10
-
11
-
12
- class SpacyDocSegmenter(BaseObject):
13
- """ Run Sentence Segmentation using spaCy """
14
-
15
- def __init__(self,
16
- nlp: English):
17
- """
18
- Created:
19
- 30-Sept-2021
20
- """
21
- BaseObject.__init__(self, __name__)
22
- self._nlp = nlp
23
-
24
- @staticmethod
25
- def _append_period(a_sentence: str) -> str:
26
- """
27
- Purpose:
28
- if the sentence is not terminated with a period, then add one
29
- :return:
30
- a sentence terminated by a period
31
- """
32
- __blacklist = [':', '?', '!']
33
- if not a_sentence.strip().endswith('.'):
34
- for ch in __blacklist:
35
- if not a_sentence.endswith(ch):
36
- return f"{a_sentence}."
37
- return a_sentence
38
-
39
- @staticmethod
40
- def _is_valid_sentence(a_sentence: str) -> bool:
41
- """
42
- Purpose:
43
- enable filtering of invalid sentences
44
- :return:
45
- True if the sentence is a valid one
46
- """
47
- if not a_sentence:
48
- return False
49
- if not len(a_sentence):
50
- return False
51
- if a_sentence.strip() == '.':
52
- return False
53
- return True
54
-
55
- @staticmethod
56
- def _cleanse(sentences: list) -> str:
57
- sentences = [sent for sent in sentences
58
- if sent != '..']
59
-
60
- normalized = []
61
-
62
- for s in sentences:
63
- s = s.replace('\n', ' ')
64
-
65
- if s.startswith('.. '):
66
- s = s[3:]
67
-
68
- if s.endswith('. ..'):
69
- s = s[:len(s) - 3].strip()
70
-
71
- normalized.append(s)
72
-
73
- return normalized
74
-
75
- def process(self,
76
- input_text: str) -> list:
77
- """
78
- Purpose:
79
- Perform Sentence Segmentation
80
- :param input_text:
81
- any input text
82
- :return:
83
- a list of 0-or-More sentences
84
- """
85
-
86
- doc = self._nlp(input_text)
87
-
88
- sentences = [str(sent) for sent in doc.sents]
89
-
90
- sentences = [sent for sent in sentences if
91
- sent and len(sent) and sent != 'None']
92
-
93
- sentences = [self._append_period(sent)
94
- for sent in sentences]
95
-
96
- sentences = [sent.strip() for sent in sentences
97
- if self._is_valid_sentence(sent)]
98
-
99
- sentences = self._cleanse(sentences)
100
-
101
- return sentences
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Run Sentence Segmentation using spaCy """
4
+
5
+
6
+ from spacy.lang.en import English
7
+
8
+
9
+ from fast_sentence_segment.core import BaseObject
10
+
11
+
12
class SpacyDocSegmenter(BaseObject):
    """ Run Sentence Segmentation using spaCy """

    def __init__(self,
                 nlp: English):
        """Initialize the segmenter with a loaded spaCy pipeline.

        Created:
            30-Sept-2021

        Args:
            nlp (English): a loaded spaCy English pipeline used to
                produce `doc.sents`
        """
        BaseObject.__init__(self, __name__)
        self._nlp = nlp

    @staticmethod
    def _append_period(a_sentence: str) -> str:
        """Terminate a sentence with a period when it has no terminal punctuation.

        Bug fix: the previous implementation looped over [':', '?', '!'] and
        returned with an appended period as soon as the sentence failed to end
        with the FIRST mark (':').  As a result "Who?" incorrectly became
        "Who?." — the stray "?." artifact that downstream components then had
        to strip.  A sentence already ending in '.', ':', '?' or '!' is now
        left untouched.

        Args:
            a_sentence (str): any sentence

        Returns:
            str: a sentence guaranteed to end in terminal punctuation
        """
        # str.endswith accepts a tuple of suffixes; check the stripped text
        # so trailing whitespace does not defeat the test
        if not a_sentence.strip().endswith(('.', ':', '?', '!')):
            return f"{a_sentence}."
        return a_sentence

    @staticmethod
    def _is_valid_sentence(a_sentence: str) -> bool:
        """Filter out invalid sentences.

        Args:
            a_sentence (str): a candidate sentence

        Returns:
            bool: True if the sentence is a valid one
        """
        # covers None and the empty string (the prior extra len() check
        # was redundant with truthiness)
        if not a_sentence:
            return False
        if a_sentence.strip() == '.':
            return False
        return True

    @staticmethod
    def _cleanse(sentences: list) -> list:
        """Normalize segmentation artifacts out of each sentence.

        Removes '..' placeholder items, flattens newlines, and strips
        leading '.. ' / trailing '. ..' residue left by segmentation.

        Note: the return annotation was previously '-> str' although a
        list has always been returned; corrected to '-> list'.

        Args:
            sentences (list): segmented sentences

        Returns:
            list: cleansed sentences
        """
        sentences = [sent for sent in sentences
                     if sent != '..']

        normalized = []

        for s in sentences:
            s = s.replace('\n', ' ')

            if s.startswith('.. '):
                s = s[3:]

            if s.endswith('. ..'):
                s = s[:len(s) - 3].strip()

            normalized.append(s)

        return normalized

    def process(self,
                input_text: str) -> list:
        """Perform Sentence Segmentation.

        Args:
            input_text (str): any input text

        Returns:
            list: a list of 0-or-More sentences
        """

        doc = self._nlp(input_text)

        sentences = [str(sent) for sent in doc.sents]

        # 'None' string guard retained: spaCy spans stringify oddly on
        # some malformed inputs
        sentences = [sent for sent in sentences if
                     sent and len(sent) and sent != 'None']

        sentences = [self._append_period(sent)
                     for sent in sentences]

        sentences = [sent.strip() for sent in sentences
                     if self._is_valid_sentence(sent)]

        return self._cleanse(sentences)
@@ -0,0 +1,152 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Merge sentences incorrectly split when a title abbreviation is followed by a
5
+ single-word name ending in punctuation.
6
+
7
+ When the question/exclamation splitter splits "Dr. Who?" into ["Dr.", "Who?"],
8
+ this component merges them back together because a title + single capitalized
9
+ word is almost certainly a name, not two sentences.
10
+
11
+ Examples that should be merged:
12
+ ["Dr.", "Who?"] -> ["Dr. Who?"]
13
+ ["Mr.", "T!"] -> ["Mr. T!"]
14
+ ["Do you like Dr.", "Who?"] -> ["Do you like Dr. Who?"]
15
+
16
+ Examples that should NOT be merged:
17
+ ["Dr.", "Where did he go?"] -> stays split (multi-word sentence)
18
+ ["Dr.", "who can help."] -> stays split (lowercase = not a name)
19
+
20
+ Reference: https://github.com/craigtrim/fast-sentence-segment/issues/3
21
+ """
22
+
23
+ import re
24
+ from typing import List, Optional, Tuple
25
+
26
+ from fast_sentence_segment.core import BaseObject
27
+ from fast_sentence_segment.dmo.abbreviations import TITLE_ABBREVIATIONS
28
+
29
+
30
# Subset of titles that commonly precede names (not geographic like Mt., St.)
PERSONAL_TITLES: List[str] = [
    "Dr.",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Prof.",
    "Sr.",
    "Jr.",
    "Rev.",
    "Gen.",
    "Col.",
    "Capt.",
    "Lt.",
    "Sgt.",
    "Rep.",
    "Sen.",
    "Gov.",
    "Pres.",
    "Hon.",
]


class TitleNameMerger(BaseObject):
    """Merge sentences incorrectly split at title + single-word name boundaries."""

    def __init__(self):
        """
        Created:
            28-Dec-2024
            craigtrim@gmail.com
        Reference:
            https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

        # Build pattern to match sentences ending with a title abbreviation.
        # Escape dots in abbreviations for regex.
        escaped_titles = [re.escape(t) for t in PERSONAL_TITLES]
        titles_pattern = "|".join(escaped_titles)
        # Bug fix: without a word boundary, IGNORECASE made ordinary words
        # that merely END in a title match — e.g. "arisen." matched "Sen."
        # and "It had arisen." + "Why?" were wrongly merged.  The \b anchor
        # requires the title to start at a word boundary.
        self._ending_with_title = re.compile(
            rf"\b({titles_pattern})$", re.IGNORECASE)

        # Pattern to match a single capitalized word followed by sentence-ending
        # punctuation at the START of a sentence (may have more content after).
        # Matches: "Who?", "Who? More text", "T!", "Who?." (trailing period), etc.
        # Captures the word+punctuation part for extraction.
        # Note: the spaCy segmenter may add a trailing period to sentences
        # ending in ?/!
        self._single_word_with_punct = re.compile(
            r"^([A-Z][a-zA-Z\-]*[?!]+\.?)\s*(.*)$")

    def _try_merge(self, current: str, next_sent: str) -> Optional[Tuple[str, str]]:
        """Try to merge two sentences if they match the title + single-word name pattern.

        Args:
            current: Current sentence (may end with title abbreviation)
            next_sent: Next sentence (may start with single-word name with punctuation)

        Returns:
            Tuple of (merged_sentence, remainder) if merge needed, else None
        """
        current = current.strip()
        next_sent = next_sent.strip()

        # Current sentence must end with a title abbreviation
        if not self._ending_with_title.search(current):
            return None

        # Next sentence must start with a single capitalized word with ?/! punctuation
        match = self._single_word_with_punct.match(next_sent)
        if not match:
            return None

        # Extract the name part and any remainder
        name_part = match.group(1)
        remainder = match.group(2).strip() if match.group(2) else ""

        # Clean up trailing period from name if present (added by spaCy)
        if name_part.endswith('?.') or name_part.endswith('!.'):
            name_part = name_part[:-1]

        merged = current + " " + name_part
        return (merged, remainder)

    def process(self, sentences: List[str]) -> List[str]:
        """Process a list of sentences, merging title + single-word name splits.

        Args:
            sentences: List of sentences

        Returns:
            List of sentences with title+name splits merged
        """
        if not sentences or len(sentences) < 2:
            return sentences

        # Work with a mutable copy (the remainder case mutates in place)
        sentences = list(sentences)
        result = []
        i = 0

        while i < len(sentences):
            current = sentences[i]

            # Check if we should merge with next sentence
            if i + 1 < len(sentences):
                next_sent = sentences[i + 1]
                merge_result = self._try_merge(current, next_sent)

                if merge_result:
                    merged, remainder = merge_result
                    result.append(merged)

                    if remainder:
                        # Leave the remainder at i+1 so it is processed
                        # as the next 'current' (it may itself need merging)
                        sentences[i + 1] = remainder
                        i += 1
                    else:
                        i += 2  # Skip both merged sentences
                    continue

            result.append(current)
            i += 1

        return result
@@ -1,2 +1,2 @@
1
- from .perform_paragraph_segmentation import PerformParagraphSegmentation
2
- from .perform_sentence_segmentation import PerformSentenceSegmentation
1
+ from .perform_paragraph_segmentation import PerformParagraphSegmentation
2
+ from .perform_sentence_segmentation import PerformSentenceSegmentation
@@ -1,50 +1,50 @@
1
- #!/usr/bin/env python
2
- # -*- coding: UTF-8 -*-
3
- """ Paragraph Segmentation """
4
-
5
-
6
- from baseblock import BaseObject
7
-
8
-
9
- class PerformParagraphSegmentation(BaseObject):
10
- """ Paragraph Segmentation """
11
-
12
- def __init__(self):
13
- """
14
- Created:
15
- 1-Oct-2021
16
- """
17
- BaseObject.__init__(self, __name__)
18
-
19
- def _process(self,
20
- input_text: str) -> list:
21
- paragraphs = input_text.split('\n\n')
22
-
23
- paragraphs = [x.strip() for x in paragraphs if x]
24
- paragraphs = [x for x in paragraphs if len(x)]
25
-
26
- return paragraphs
27
-
28
- def process(self,
29
- input_text: str) -> list:
30
- """Perform Paragraph Segmentation
31
-
32
- Args:
33
- input_text (str): An input string of any length or type
34
-
35
- Raises:
36
- ValueError: input must be a string
37
-
38
- Returns:
39
- list: a list of 1..* paragraphs
40
- each list item is an input string of any length, but is a paragraph
41
- A paragraph is a structural concept rather than a semantic one
42
- """
43
- if input_text is None or not len(input_text):
44
- raise ValueError("Empty Input")
45
-
46
- if type(input_text) != str:
47
- self.logger.warning(f"Invalid Input Text: {input_text}")
48
- return []
49
-
50
- return self._process(input_text)
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """ Paragraph Segmentation """
4
+
5
+
6
+ from fast_sentence_segment.core import BaseObject
7
+
8
+
9
class PerformParagraphSegmentation(BaseObject):
    """ Paragraph Segmentation """

    def __init__(self):
        """
        Created:
            1-Oct-2021
        """
        BaseObject.__init__(self, __name__)

    def _process(self,
                 input_text: str) -> list:
        # Paragraphs are delimited by blank lines; drop empty and
        # whitespace-only segments
        paragraphs = input_text.split('\n\n')

        paragraphs = [x.strip() for x in paragraphs if x]
        paragraphs = [x for x in paragraphs if len(x)]

        return paragraphs

    def process(self,
                input_text: str) -> list:
        """Perform Paragraph Segmentation

        Args:
            input_text (str): An input string of any length or type

        Raises:
            ValueError: input is None or empty

        Returns:
            list: a list of 1..* paragraphs
                each list item is an input string of any length, but is a paragraph
                A paragraph is a structural concept rather than a semantic one
        """
        if input_text is None:
            raise ValueError("Empty Input")

        # Bug fix: the type check must precede len(); previously a
        # non-string, non-None input (e.g. an int) raised TypeError on
        # len() instead of being logged and rejected as intended.
        if not isinstance(input_text, str):
            self.logger.warning(f"Invalid Input Text: {input_text}")
            return []

        if not len(input_text):
            raise ValueError("Empty Input")

        return self._process(input_text)