fast-sentence-segment 0.1.9-py3-none-any.whl → 1.1.8-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/__init__.py +18 -18
- fast_sentence_segment/bp/__init__.py +1 -1
- fast_sentence_segment/bp/segmenter.py +65 -68
- fast_sentence_segment/core/__init__.py +4 -0
- fast_sentence_segment/core/base_object.py +18 -0
- fast_sentence_segment/core/stopwatch.py +38 -0
- fast_sentence_segment/dmo/__init__.py +6 -6
- fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
- fast_sentence_segment/dmo/delimiters_to_periods.py +37 -37
- fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
- fast_sentence_segment/dmo/numbered_list_normalizer.py +53 -53
- fast_sentence_segment/dmo/post_process_sentences.py +48 -48
- fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
- fast_sentence_segment/svc/__init__.py +2 -2
- fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
- fast_sentence_segment/svc/perform_sentence_segmentation.py +129 -129
- fast_sentence_segment-1.1.8.dist-info/METADATA +146 -0
- fast_sentence_segment-1.1.8.dist-info/RECORD +20 -0
- {fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.1.8.dist-info}/WHEEL +1 -1
- fast_sentence_segment-1.1.8.dist-info/licenses/LICENSE +21 -0
- fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
- fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16
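
The recurring functional change across the modules below is the new import from fast_sentence_segment.core import BaseObject (the core package added in 1.1.8, per the base_object.py and stopwatch.py entries above) plus an isinstance guard in the two service classes; the corresponding 0.1.9 import and guard lines appear truncated in this diff view. Because core/base_object.py itself is not shown in this section, the sketch below is only an inference from how the subclasses use it (BaseObject.__init__(self, __name__) and self.logger.warning(...)), not the actual 18-line module:

    import logging


    class BaseObject:
        """ Inferred sketch only: the real fast_sentence_segment.core.BaseObject
            is not shown in this diff; the subclasses below rely only on a
            per-component logger. """

        def __init__(self, component_name: str):
            # expose a named logger so subclasses can call self.logger.warning(...)
            self.logger = logging.getLogger(component_name)
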
fast_sentence_segment/dmo/post_process_sentences.py
@@ -1,48 +1,48 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 """ Post Process Sentences """


-from
+from fast_sentence_segment.core import BaseObject


 class PostProcessStructure(BaseObject):
     """ Post Process Sentences """

     __replace = {
         '..': '. ',
         '. .': '. ',

         ',.': ', ',
         ', .': ', ',

         '!.': '! ',
         '! .': '! ',

         '?.': '? ',
         '? .': '? ',

         ':.': ': ',
         ': .': ': ',
     }

     def __init__(self):
         """
         Created:
             1-Oct-2021
         """
         BaseObject.__init__(self, __name__)

     def process(self,
                 sentences: list) -> list:
         normalized = []

         for sentence in sentences:

             for k in self.__replace:
                 if k in sentence:
                     sentence = sentence.replace(k, self.__replace[k]).strip()

             normalized.append(sentence)

         return normalized
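
A minimal usage sketch of the post-processor above, importing it the same way the service module later in this diff does (from fast_sentence_segment.dmo):

    from fast_sentence_segment.dmo import PostProcessStructure

    cleaner = PostProcessStructure()
    # doubled terminators are collapsed by the __replace table above,
    # e.g. ['Hello world.', 'One. Two']
    print(cleaner.process(["Hello world..", "One. .Two"]))
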
fast_sentence_segment/dmo/spacy_doc_segmenter.py
@@ -1,101 +1,101 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 """ Run Sentence Segmentation using spaCy """


 from spacy.lang.en import English


-from
+from fast_sentence_segment.core import BaseObject


 class SpacyDocSegmenter(BaseObject):
     """ Run Sentence Segmentation using spaCy """

     def __init__(self,
                  nlp: English):
         """
         Created:
             30-Sept-2021
         """
         BaseObject.__init__(self, __name__)
         self._nlp = nlp

     @staticmethod
     def _append_period(a_sentence: str) -> str:
         """
         Purpose:
             if the sentence is not terminated with a period, then add one
         :return:
             a sentence terminated by a period
         """
         __blacklist = [':', '?', '!']
         if not a_sentence.strip().endswith('.'):
             for ch in __blacklist:
                 if not a_sentence.endswith(ch):
                     return f"{a_sentence}."
         return a_sentence

     @staticmethod
     def _is_valid_sentence(a_sentence: str) -> bool:
         """
         Purpose:
             enable filtering of invalid sentences
         :return:
             True if the sentence is a valid one
         """
         if not a_sentence:
             return False
         if not len(a_sentence):
             return False
         if a_sentence.strip() == '.':
             return False
         return True

     @staticmethod
     def _cleanse(sentences: list) -> str:
         sentences = [sent for sent in sentences
                      if sent != '..']

         normalized = []

         for s in sentences:
             s = s.replace('\n', ' ')

             if s.startswith('.. '):
                 s = s[3:]

             if s.endswith('. ..'):
                 s = s[:len(s) - 3].strip()

             normalized.append(s)

         return normalized

     def process(self,
                 input_text: str) -> list:
         """
         Purpose:
             Perform Sentence Segmentation
         :param input_text:
             any input text
         :return:
             a list of 0-or-More sentences
         """

         doc = self._nlp(input_text)

         sentences = [str(sent) for sent in doc.sents]

         sentences = [sent for sent in sentences if
                      sent and len(sent) and sent != 'None']

         sentences = [self._append_period(sent)
                      for sent in sentences]

         sentences = [sent.strip() for sent in sentences
                      if self._is_valid_sentence(sent)]

         sentences = self._cleanse(sentences)

         return sentences
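
A usage sketch for the segmenter above; the en_core_web_sm pipeline is the one the service class later in this diff loads, so this mirrors how the package wires it up rather than documenting a separate public API:

    import spacy
    from fast_sentence_segment.dmo import SpacyDocSegmenter

    nlp = spacy.load("en_core_web_sm")   # assumes the model is installed
    segmenter = SpacyDocSegmenter(nlp)

    # each spaCy sentence is filtered, stripped, and period-terminated by the code above
    print(segmenter.process("Dr. Smith arrived late. The meeting had already started"))
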
fast_sentence_segment/svc/__init__.py
@@ -1,2 +1,2 @@
-from .perform_paragraph_segmentation import PerformParagraphSegmentation
-from .perform_sentence_segmentation import PerformSentenceSegmentation
+from .perform_paragraph_segmentation import PerformParagraphSegmentation
+from .perform_sentence_segmentation import PerformSentenceSegmentation
fast_sentence_segment/svc/perform_paragraph_segmentation.py
@@ -1,50 +1,50 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 """ Paragraph Segmentation """


-from
+from fast_sentence_segment.core import BaseObject


 class PerformParagraphSegmentation(BaseObject):
     """ Paragraph Segmentation """

     def __init__(self):
         """
         Created:
             1-Oct-2021
         """
         BaseObject.__init__(self, __name__)

     def _process(self,
                  input_text: str) -> list:
         paragraphs = input_text.split('\n\n')

         paragraphs = [x.strip() for x in paragraphs if x]
         paragraphs = [x for x in paragraphs if len(x)]

         return paragraphs

     def process(self,
                 input_text: str) -> list:
         """Perform Paragraph Segmentation

         Args:
             input_text (str): An input string of any length or type

         Raises:
             ValueError: input must be a string

         Returns:
             list: a list of 1..* paragraphs
                 each list item is an input string of any length, but is a paragraph
                 A paragraph is a structural concept rather than a semantic one
         """
         if input_text is None or not len(input_text):
             raise ValueError("Empty Input")

-        if
+        if not isinstance(input_text, str):
             self.logger.warning(f"Invalid Input Text: {input_text}")
             return []

         return self._process(input_text)
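
A usage sketch for the paragraph segmenter, using the export shown in svc/__init__.py above:

    from fast_sentence_segment.svc import PerformParagraphSegmentation

    segmenter = PerformParagraphSegmentation()
    # split on blank lines per _process() above:
    # ['First paragraph.', 'Second paragraph.']
    print(segmenter.process("First paragraph.\n\nSecond paragraph."))
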
fast_sentence_segment/svc/perform_sentence_segmentation.py
@@ -1,129 +1,129 @@
 #!/usr/bin/env python
 # -*- coding: UTF-8 -*-
 """ Sentence Segmentation """


 import spacy

-from
+from fast_sentence_segment.core import BaseObject

 from fast_sentence_segment.dmo import NewlinesToPeriods
 from fast_sentence_segment.dmo import DelimitersToPeriods
 from fast_sentence_segment.dmo import BulletPointCleaner
 from fast_sentence_segment.dmo import NumberedListNormalizer
 from fast_sentence_segment.dmo import SpacyDocSegmenter
 from fast_sentence_segment.dmo import PostProcessStructure


 class PerformSentenceSegmentation(BaseObject):
     """ Sentence Segmentation """

     __nlp = None

     def __init__(self):
         """ Change Log

         Created:
             30-Sept-2021
             craigtrim@gmail.com
         Updated:
             19-Oct-2022
             craigtrim@gmail.com
             * add numbered-list normalization
                 https://github.com/craigtrim/fast-sentence-segment/issues/1
         """
         BaseObject.__init__(self, __name__)
         if not self.__nlp:
             self.__nlp = spacy.load("en_core_web_sm")

         self._delimiters_to_periods = DelimitersToPeriods.process
         self._newlines_to_periods = NewlinesToPeriods.process
         self._normalize_numbered_lists = NumberedListNormalizer().process
         self._clean_bullet_points = BulletPointCleaner.process
         self._spacy_segmenter = SpacyDocSegmenter(self.__nlp).process
         self._post_process = PostProcessStructure().process

     @staticmethod
     def _clean_punctuation(input_text: str) -> str:
         """ Purpose:
             Clean punctuation oddities; this is likely highly overfitted (for now)
         """
         if ", Inc" in input_text:
             input_text = input_text.replace(", Inc", " Inc")

         return input_text

     @staticmethod
     def _clean_spacing(a_sentence: str) -> str:

         # eliminate triple-space
         a_sentence = a_sentence.replace('   ', ' ')

         # treat double-space as delimiter
         a_sentence = a_sentence.replace('  ', '. ')

         return a_sentence

     def _process(self,
                  input_text: str) -> list:

         input_text = self._delimiters_to_periods(
             delimiter=',',
             input_text=input_text)

         input_text = self._delimiters_to_periods(
             delimiter=';',
             input_text=input_text)

         input_text = self._normalize_numbered_lists(input_text)

         input_text = self._newlines_to_periods(input_text)

         input_text = self._clean_spacing(input_text)
         if "." not in input_text:
             return [input_text]

         input_text = self._clean_bullet_points(input_text)
         if "." not in input_text:
             return [input_text]

         input_text = self._clean_punctuation(input_text)
         if "." not in input_text:
             return [input_text]

         sentences = self._spacy_segmenter(input_text)
         if "." not in input_text:
             return [input_text]

         sentences = self._post_process(sentences)

         sentences = [
             self._normalize_numbered_lists(x, denormalize=True)
             for x in sentences
         ]

         return sentences

     def process(self,
                 input_text: str) -> list:
         """Perform Sentence Segmentation

         Args:
             input_text (str): An input string of any length or type

         Raises:
             ValueError: input must be a string

         Returns:
             list: a list of sentences
                 each list item is an input string of any length, but is a semantic sentence
         """

         if input_text is None or not len(input_text):
             raise ValueError("Empty Input")

-        if
+        if not isinstance(input_text, str):
             self.logger.warning(f"Invalid Input Text: {input_text}")
             return []

         return self._process(input_text)
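
A usage sketch for the main service class, again using the svc export shown earlier; whether the top-level fast_sentence_segment/__init__.py re-exports it is not visible in this section, so the svc path is used directly:

    from fast_sentence_segment.svc import PerformSentenceSegmentation

    segmenter = PerformSentenceSegmentation()   # loads spaCy's en_core_web_sm once

    text = "The meeting ran long.\n\nAction items:\n1. Review the budget\n2. Schedule hiring"
    for sentence in segmenter.process(text):
        print(sentence)
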