fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/__init__.py +18 -18
- fast_sentence_segment/bp/__init__.py +1 -1
- fast_sentence_segment/bp/segmenter.py +65 -68
- fast_sentence_segment/cli.py +56 -0
- fast_sentence_segment/core/__init__.py +4 -0
- fast_sentence_segment/core/base_object.py +18 -0
- fast_sentence_segment/core/stopwatch.py +38 -0
- fast_sentence_segment/dmo/__init__.py +10 -6
- fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
- fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
- fast_sentence_segment/dmo/abbreviations.py +96 -0
- fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
- fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
- fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
- fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -53
- fast_sentence_segment/dmo/post_process_sentences.py +48 -48
- fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
- fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
- fast_sentence_segment/dmo/title_name_merger.py +152 -0
- fast_sentence_segment/svc/__init__.py +2 -2
- fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
- fast_sentence_segment/svc/perform_sentence_segmentation.py +165 -129
- fast_sentence_segment-1.2.0.dist-info/METADATA +189 -0
- fast_sentence_segment-1.2.0.dist-info/RECORD +27 -0
- {fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.2.0.dist-info}/WHEEL +1 -1
- fast_sentence_segment-1.2.0.dist-info/entry_points.txt +3 -0
- fast_sentence_segment-1.2.0.dist-info/licenses/LICENSE +21 -0
- fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
- fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
- fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16
|
@@ -1,101 +1,101 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Run Sentence Segmentation using spaCy """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from spacy.lang.en import English
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
from
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
class SpacyDocSegmenter(BaseObject):
|
|
13
|
-
""" Run Sentence Segmentation using spaCy """
|
|
14
|
-
|
|
15
|
-
def __init__(self,
|
|
16
|
-
nlp: English):
|
|
17
|
-
"""
|
|
18
|
-
Created:
|
|
19
|
-
30-Sept-2021
|
|
20
|
-
"""
|
|
21
|
-
BaseObject.__init__(self, __name__)
|
|
22
|
-
self._nlp = nlp
|
|
23
|
-
|
|
24
|
-
@staticmethod
|
|
25
|
-
def _append_period(a_sentence: str) -> str:
|
|
26
|
-
"""
|
|
27
|
-
Purpose:
|
|
28
|
-
if the sentence is not terminated with a period, then add one
|
|
29
|
-
:return:
|
|
30
|
-
a sentence terminated by a period
|
|
31
|
-
"""
|
|
32
|
-
__blacklist = [':', '?', '!']
|
|
33
|
-
if not a_sentence.strip().endswith('.'):
|
|
34
|
-
for ch in __blacklist:
|
|
35
|
-
if not a_sentence.endswith(ch):
|
|
36
|
-
return f"{a_sentence}."
|
|
37
|
-
return a_sentence
|
|
38
|
-
|
|
39
|
-
@staticmethod
|
|
40
|
-
def _is_valid_sentence(a_sentence: str) -> bool:
|
|
41
|
-
"""
|
|
42
|
-
Purpose:
|
|
43
|
-
enable filtering of invalid sentences
|
|
44
|
-
:return:
|
|
45
|
-
True if the sentence is a valid one
|
|
46
|
-
"""
|
|
47
|
-
if not a_sentence:
|
|
48
|
-
return False
|
|
49
|
-
if not len(a_sentence):
|
|
50
|
-
return False
|
|
51
|
-
if a_sentence.strip() == '.':
|
|
52
|
-
return False
|
|
53
|
-
return True
|
|
54
|
-
|
|
55
|
-
@staticmethod
|
|
56
|
-
def _cleanse(sentences: list) -> str:
|
|
57
|
-
sentences = [sent for sent in sentences
|
|
58
|
-
if sent != '..']
|
|
59
|
-
|
|
60
|
-
normalized = []
|
|
61
|
-
|
|
62
|
-
for s in sentences:
|
|
63
|
-
s = s.replace('\n', ' ')
|
|
64
|
-
|
|
65
|
-
if s.startswith('.. '):
|
|
66
|
-
s = s[3:]
|
|
67
|
-
|
|
68
|
-
if s.endswith('. ..'):
|
|
69
|
-
s = s[:len(s) - 3].strip()
|
|
70
|
-
|
|
71
|
-
normalized.append(s)
|
|
72
|
-
|
|
73
|
-
return normalized
|
|
74
|
-
|
|
75
|
-
def process(self,
|
|
76
|
-
input_text: str) -> list:
|
|
77
|
-
"""
|
|
78
|
-
Purpose:
|
|
79
|
-
Perform Sentence Segmentation
|
|
80
|
-
:param input_text:
|
|
81
|
-
any input text
|
|
82
|
-
:return:
|
|
83
|
-
a list of 0-or-More sentences
|
|
84
|
-
"""
|
|
85
|
-
|
|
86
|
-
doc = self._nlp(input_text)
|
|
87
|
-
|
|
88
|
-
sentences = [str(sent) for sent in doc.sents]
|
|
89
|
-
|
|
90
|
-
sentences = [sent for sent in sentences if
|
|
91
|
-
sent and len(sent) and sent != 'None']
|
|
92
|
-
|
|
93
|
-
sentences = [self._append_period(sent)
|
|
94
|
-
for sent in sentences]
|
|
95
|
-
|
|
96
|
-
sentences = [sent.strip() for sent in sentences
|
|
97
|
-
if self._is_valid_sentence(sent)]
|
|
98
|
-
|
|
99
|
-
sentences = self._cleanse(sentences)
|
|
100
|
-
|
|
101
|
-
return sentences
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Run Sentence Segmentation using spaCy """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from spacy.lang.en import English
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
from fast_sentence_segment.core import BaseObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class SpacyDocSegmenter(BaseObject):
    """Run sentence segmentation using spaCy.

    The object passed to the constructor is called on the input text and the
    spans yielded by ``doc.sents`` are converted to cleaned-up strings.
    """

    def __init__(self,
                 nlp: English):
        """
        Created:
            30-Sept-2021

        Args:
            nlp: callable applied to the input text; its result must expose
                a ``sents`` iterable (see ``process``)
        """
        BaseObject.__init__(self, __name__)
        self._nlp = nlp

    @staticmethod
    def _append_period(a_sentence: str) -> str:
        """
        Purpose:
            Terminate a sentence with a period unless it already ends with
            closing punctuation.

            A sentence whose stripped text ends in '.', ':', '?' or '!' is
            returned unchanged.  The previous implementation returned from
            inside its loop over the exempt characters, so the exemption never
            took effect and "Who?" became "Who?.".
        :param a_sentence:
            the candidate sentence
        :return:
            the sentence, with a period appended only when it had no
            terminating punctuation
        """
        # Test the stripped text so trailing whitespace cannot hide the
        # terminator; the caller strips the result afterwards.
        if a_sentence.strip().endswith(('.', ':', '?', '!')):
            return a_sentence
        return f"{a_sentence}."

    @staticmethod
    def _is_valid_sentence(a_sentence: str) -> bool:
        """
        Purpose:
            enable filtering of invalid sentences
        :param a_sentence:
            the candidate sentence
        :return:
            False for an empty value or a lone period (ignoring surrounding
            whitespace); True otherwise
        """
        if not a_sentence:
            return False
        return a_sentence.strip() != '.'

    @staticmethod
    def _cleanse(sentences: list) -> list:
        """
        Purpose:
            Drop '..' artifacts and normalize the remaining sentences
        :param sentences:
            a list of sentence strings
        :return:
            a new list in which bare '..' entries are removed, newlines are
            replaced by spaces, a leading '.. ' is cut off and a trailing
            ' ..' after a period is cut off
        """
        normalized = []

        for s in sentences:
            if s == '..':
                continue

            s = s.replace('\n', ' ')

            if s.startswith('.. '):
                s = s[3:]

            if s.endswith('. ..'):
                # remove the trailing ' ..' but keep the sentence's own period
                s = s[:-3].strip()

            normalized.append(s)

        return normalized

    def process(self,
                input_text: str) -> list:
        """
        Purpose:
            Perform Sentence Segmentation
        :param input_text:
            any input text
        :return:
            a list of 0-or-More sentences
        """
        doc = self._nlp(input_text)

        sentences = [str(sent) for sent in doc.sents]

        # discard empty spans and the literal text 'None'
        sentences = [sent for sent in sentences
                     if sent and sent != 'None']

        sentences = [self._append_period(sent)
                     for sent in sentences]

        sentences = [sent.strip() for sent in sentences
                     if self._is_valid_sentence(sent)]

        return self._cleanse(sentences)
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
"""
|
|
4
|
+
Merge sentences incorrectly split when a title abbreviation is followed by a
|
|
5
|
+
single-word name ending in punctuation.
|
|
6
|
+
|
|
7
|
+
When the question/exclamation splitter splits "Dr. Who?" into ["Dr.", "Who?"],
|
|
8
|
+
this component merges them back together because a title + single capitalized
|
|
9
|
+
word is almost certainly a name, not two sentences.
|
|
10
|
+
|
|
11
|
+
Examples that should be merged:
|
|
12
|
+
["Dr.", "Who?"] -> ["Dr. Who?"]
|
|
13
|
+
["Mr.", "T!"] -> ["Mr. T!"]
|
|
14
|
+
["Do you like Dr.", "Who?"] -> ["Do you like Dr. Who?"]
|
|
15
|
+
|
|
16
|
+
Examples that should NOT be merged:
|
|
17
|
+
["Dr.", "Where did he go?"] -> stays split (multi-word sentence)
|
|
18
|
+
["Dr.", "who can help."] -> stays split (lowercase = not a name)
|
|
19
|
+
|
|
20
|
+
Reference: https://github.com/craigtrim/fast-sentence-segment/issues/3
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
import re
|
|
24
|
+
from typing import List, Optional, Tuple
|
|
25
|
+
|
|
26
|
+
from fast_sentence_segment.core import BaseObject
|
|
27
|
+
from fast_sentence_segment.dmo.abbreviations import TITLE_ABBREVIATIONS
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Subset of titles that commonly precede names (not geographic like Mt., St.)
# Every entry is written with its trailing period.
PERSONAL_TITLES: List[str] = [
    "Dr.",
    "Mr.",
    "Mrs.",
    "Ms.",
    "Prof.",
    "Sr.",
    "Jr.",
    "Rev.",
    "Gen.",
    "Col.",
    "Capt.",
    "Lt.",
    "Sgt.",
    "Rep.",
    "Sen.",
    "Gov.",
    "Pres.",
    "Hon.",
]
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class TitleNameMerger(BaseObject):
    """Merge sentences incorrectly split at title + single-word name boundaries."""

    def __init__(self):
        """
        Created:
            28-Dec-2024
            craigtrim@gmail.com
        Reference:
            https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

        # Build pattern to match sentences ending with a title abbreviation.
        # re.escape keeps the dots in the abbreviations literal.
        titles_pattern = "|".join(re.escape(t) for t in PERSONAL_TITLES)

        # The leading \b requires the title to start at a word boundary.
        # Without it the case-insensitive match also fired on ordinary words
        # that merely end in the same letters ("result." / "Lt.",
        # "items." / "Ms.", "protocol." / "Col.").
        self._ending_with_title = re.compile(rf"\b({titles_pattern})$", re.IGNORECASE)

        # Pattern to match a single capitalized word followed by sentence-ending
        # punctuation at the START of a sentence (may have more content after).
        # Matches: "Who?", "Who? More text", "T!", "T! More", "Who?." etc.
        # Group 1 is the word + punctuation, group 2 is whatever follows.
        # The optional trailing period tolerates a period appended after ?/!
        # by an earlier pipeline stage.
        self._single_word_with_punct = re.compile(r"^([A-Z][a-zA-Z\-]*[?!]+\.?)\s*(.*)$")

    def _try_merge(self, current: str, next_sent: str) -> Optional[Tuple[str, str]]:
        """Try to merge two sentences if they match the title + single-word name pattern.

        Args:
            current: Current sentence (may end with title abbreviation)
            next_sent: Next sentence (may start with single-word name with punctuation)

        Returns:
            Tuple of (merged_sentence, remainder) if merge needed, else None.
            ``remainder`` is the stripped text that followed the name in
            ``next_sent``, or "" when there was none.
        """
        current = current.strip()
        next_sent = next_sent.strip()

        # Current sentence must end with a title abbreviation
        if not self._ending_with_title.search(current):
            return None

        # Next sentence must start with a single capitalized word with ?/! punctuation
        match = self._single_word_with_punct.match(next_sent)
        if not match:
            return None

        # Extract the name part and any remainder
        name_part = match.group(1)
        remainder = (match.group(2) or "").strip()

        # Drop a period that trails the ?/! so the merged text ends cleanly
        if name_part.endswith(('?.', '!.')):
            name_part = name_part[:-1]

        return (f"{current} {name_part}", remainder)

    def process(self, sentences: List[str]) -> List[str]:
        """Process a list of sentences, merging title + single-word name splits.

        Args:
            sentences: List of sentences

        Returns:
            List of sentences with title+name splits merged.  Inputs with
            fewer than two sentences are returned unchanged.
        """
        if not sentences or len(sentences) < 2:
            return sentences

        # Work with a mutable copy: a remainder left over from a merge is
        # written back into the list so it is examined on the next iteration.
        sentences = list(sentences)
        result = []
        i = 0

        while i < len(sentences):
            current = sentences[i]

            # Check if we should merge with next sentence
            if i + 1 < len(sentences):
                merge_result = self._try_merge(current, sentences[i + 1])

                if merge_result:
                    merged, remainder = merge_result
                    result.append(merged)

                    if remainder:
                        # Keep the unmerged tail in place of the next sentence
                        # and continue from it.
                        sentences[i + 1] = remainder
                        i += 1
                    else:
                        i += 2  # Skip both merged sentences
                    continue

            result.append(current)
            i += 1

        return result
|
|
@@ -1,2 +1,2 @@
|
|
|
1
|
-
from .perform_paragraph_segmentation import PerformParagraphSegmentation
|
|
2
|
-
from .perform_sentence_segmentation import PerformSentenceSegmentation
|
|
1
|
+
from .perform_paragraph_segmentation import PerformParagraphSegmentation
|
|
2
|
+
from .perform_sentence_segmentation import PerformSentenceSegmentation
|
|
@@ -1,50 +1,50 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Paragraph Segmentation """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class PerformParagraphSegmentation(BaseObject):
|
|
10
|
-
""" Paragraph Segmentation """
|
|
11
|
-
|
|
12
|
-
def __init__(self):
|
|
13
|
-
"""
|
|
14
|
-
Created:
|
|
15
|
-
1-Oct-2021
|
|
16
|
-
"""
|
|
17
|
-
BaseObject.__init__(self, __name__)
|
|
18
|
-
|
|
19
|
-
def _process(self,
|
|
20
|
-
input_text: str) -> list:
|
|
21
|
-
paragraphs = input_text.split('\n\n')
|
|
22
|
-
|
|
23
|
-
paragraphs = [x.strip() for x in paragraphs if x]
|
|
24
|
-
paragraphs = [x for x in paragraphs if len(x)]
|
|
25
|
-
|
|
26
|
-
return paragraphs
|
|
27
|
-
|
|
28
|
-
def process(self,
|
|
29
|
-
input_text: str) -> list:
|
|
30
|
-
"""Perform Paragraph Segmentation
|
|
31
|
-
|
|
32
|
-
Args:
|
|
33
|
-
input_text (str): An input string of any length or type
|
|
34
|
-
|
|
35
|
-
Raises:
|
|
36
|
-
ValueError: input must be a string
|
|
37
|
-
|
|
38
|
-
Returns:
|
|
39
|
-
list: a list of 1..* paragraphs
|
|
40
|
-
each list item is an input string of any length, but is a paragraph
|
|
41
|
-
A paragraph is a structural concept rather than a semantic one
|
|
42
|
-
"""
|
|
43
|
-
if input_text is None or not len(input_text):
|
|
44
|
-
raise ValueError("Empty Input")
|
|
45
|
-
|
|
46
|
-
if
|
|
47
|
-
self.logger.warning(f"Invalid Input Text: {input_text}")
|
|
48
|
-
return []
|
|
49
|
-
|
|
50
|
-
return self._process(input_text)
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Paragraph Segmentation """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from fast_sentence_segment.core import BaseObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PerformParagraphSegmentation(BaseObject):
    """ Paragraph Segmentation """

    def __init__(self):
        """
        Created:
            1-Oct-2021
        """
        BaseObject.__init__(self, __name__)

    def _process(self,
                 input_text: str) -> list:
        """Split text on blank lines ('\\n\\n') and drop empty paragraphs.

        Args:
            input_text (str): the text to split

        Returns:
            list: the stripped, non-empty chunks in their original order
        """
        stripped = (chunk.strip() for chunk in input_text.split('\n\n'))
        return [paragraph for paragraph in stripped if paragraph]

    def process(self,
                input_text: str) -> list:
        """Perform Paragraph Segmentation

        Args:
            input_text (str): An input string of any length

        Raises:
            ValueError: the input is None or empty

        Returns:
            list: a list of 0..* paragraphs
                each list item is an input string of any length, but is a paragraph
                A paragraph is a structural concept rather than a semantic one
                A non-string input yields an empty list (a warning is logged)
        """
        if input_text is None:
            raise ValueError("Empty Input")

        if not isinstance(input_text, str):
            # Empty sized values (e.g. []) keep raising as they always did.
            # The hasattr guard stops len() from raising TypeError on
            # unsized values such as ints.
            if hasattr(input_text, '__len__') and not len(input_text):
                raise ValueError("Empty Input")

            # NOTE(review): self.logger is presumably supplied by BaseObject
            self.logger.warning(f"Invalid Input Text: {input_text}")
            return []

        if not input_text:
            raise ValueError("Empty Input")

        return self._process(input_text)
|