fast-sentence-segment 0.1.9__py3-none-any.whl → 1.1.8__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
@@ -1,48 +1,48 @@
- #!/usr/bin/env python
- # -*- coding: UTF-8 -*-
- """ Post Process Sentences """
-
-
- from baseblock import BaseObject
-
-
- class PostProcessStructure(BaseObject):
-     """ Post Process Sentences """
-
-     __replace = {
-         '..': '. ',
-         '. .': '. ',
-
-         ',.': ', ',
-         ', .': ', ',
-
-         '!.': '! ',
-         '! .': '! ',
-
-         '?.': '? ',
-         '? .': '? ',
-
-         ':.': ': ',
-         ': .': ': ',
-     }
-
-     def __init__(self):
-         """
-         Created:
-             1-Oct-2021
-         """
-         BaseObject.__init__(self, __name__)
-
-     def process(self,
-                 sentences: list) -> list:
-         normalized = []
-
-         for sentence in sentences:
-
-             for k in self.__replace:
-                 if k in sentence:
-                     sentence = sentence.replace(k, self.__replace[k]).strip()
-
-             normalized.append(sentence)
-
-         return normalized
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """ Post Process Sentences """
+
+
+ from fast_sentence_segment.core import BaseObject
+
+
+ class PostProcessStructure(BaseObject):
+     """ Post Process Sentences """
+
+     __replace = {
+         '..': '. ',
+         '. .': '. ',
+
+         ',.': ', ',
+         ', .': ', ',
+
+         '!.': '! ',
+         '! .': '! ',
+
+         '?.': '? ',
+         '? .': '? ',
+
+         ':.': ': ',
+         ': .': ': ',
+     }
+
+     def __init__(self):
+         """
+         Created:
+             1-Oct-2021
+         """
+         BaseObject.__init__(self, __name__)
+
+     def process(self,
+                 sentences: list) -> list:
+         normalized = []
+
+         for sentence in sentences:
+
+             for k in self.__replace:
+                 if k in sentence:
+                     sentence = sentence.replace(k, self.__replace[k]).strip()
+
+             normalized.append(sentence)
+
+         return normalized
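
The only change in this module is the BaseObject import, which moves from the external baseblock dependency to a copy vendored inside the package. For reference, a minimal standalone sketch of the replacement pass performed by PostProcessStructure.process (the names below are illustrative, not the packaged API):

# Illustrative sketch only: mirrors the dictionary-driven cleanup shown in the
# diff above; every key found in a sentence is replaced by its value, then the
# sentence is stripped of surrounding whitespace.
REPLACEMENTS = {
    '..': '. ', '. .': '. ',
    ',.': ', ', ', .': ', ',
    '!.': '! ', '! .': '! ',
    '?.': '? ', '? .': '? ',
    ':.': ': ', ': .': ': ',
}


def post_process(sentences: list) -> list:
    normalized = []
    for sentence in sentences:
        for key, value in REPLACEMENTS.items():
            if key in sentence:
                sentence = sentence.replace(key, value).strip()
        normalized.append(sentence)
    return normalized


print(post_process(['Hello world..', 'How are you? .']))
# expected: ['Hello world.', 'How are you?']
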
@@ -1,101 +1,101 @@
- #!/usr/bin/env python
- # -*- coding: UTF-8 -*-
- """ Run Sentence Segmentation using spaCy """
-
-
- from spacy.lang.en import English
-
-
- from baseblock import BaseObject
-
-
- class SpacyDocSegmenter(BaseObject):
-     """ Run Sentence Segmentation using spaCy """
-
-     def __init__(self,
-                  nlp: English):
-         """
-         Created:
-             30-Sept-2021
-         """
-         BaseObject.__init__(self, __name__)
-         self._nlp = nlp
-
-     @staticmethod
-     def _append_period(a_sentence: str) -> str:
-         """
-         Purpose:
-             if the sentence is not terminated with a period, then add one
-         :return:
-             a sentence terminated by a period
-         """
-         __blacklist = [':', '?', '!']
-         if not a_sentence.strip().endswith('.'):
-             for ch in __blacklist:
-                 if not a_sentence.endswith(ch):
-                     return f"{a_sentence}."
-         return a_sentence
-
-     @staticmethod
-     def _is_valid_sentence(a_sentence: str) -> bool:
-         """
-         Purpose:
-             enable filtering of invalid sentences
-         :return:
-             True if the sentence is a valid one
-         """
-         if not a_sentence:
-             return False
-         if not len(a_sentence):
-             return False
-         if a_sentence.strip() == '.':
-             return False
-         return True
-
-     @staticmethod
-     def _cleanse(sentences: list) -> str:
-         sentences = [sent for sent in sentences
-                      if sent != '..']
-
-         normalized = []
-
-         for s in sentences:
-             s = s.replace('\n', ' ')
-
-             if s.startswith('.. '):
-                 s = s[3:]
-
-             if s.endswith('. ..'):
-                 s = s[:len(s) - 3].strip()
-
-             normalized.append(s)
-
-         return normalized
-
-     def process(self,
-                 input_text: str) -> list:
-         """
-         Purpose:
-             Perform Sentence Segmentation
-         :param input_text:
-             any input text
-         :return:
-             a list of 0-or-More sentences
-         """
-
-         doc = self._nlp(input_text)
-
-         sentences = [str(sent) for sent in doc.sents]
-
-         sentences = [sent for sent in sentences if
-                      sent and len(sent) and sent != 'None']
-
-         sentences = [self._append_period(sent)
-                      for sent in sentences]
-
-         sentences = [sent.strip() for sent in sentences
-                      if self._is_valid_sentence(sent)]
-
-         sentences = self._cleanse(sentences)
-
-         return sentences
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """ Run Sentence Segmentation using spaCy """
+
+
+ from spacy.lang.en import English
+
+
+ from fast_sentence_segment.core import BaseObject
+
+
+ class SpacyDocSegmenter(BaseObject):
+     """ Run Sentence Segmentation using spaCy """
+
+     def __init__(self,
+                  nlp: English):
+         """
+         Created:
+             30-Sept-2021
+         """
+         BaseObject.__init__(self, __name__)
+         self._nlp = nlp
+
+     @staticmethod
+     def _append_period(a_sentence: str) -> str:
+         """
+         Purpose:
+             if the sentence is not terminated with a period, then add one
+         :return:
+             a sentence terminated by a period
+         """
+         __blacklist = [':', '?', '!']
+         if not a_sentence.strip().endswith('.'):
+             for ch in __blacklist:
+                 if not a_sentence.endswith(ch):
+                     return f"{a_sentence}."
+         return a_sentence
+
+     @staticmethod
+     def _is_valid_sentence(a_sentence: str) -> bool:
+         """
+         Purpose:
+             enable filtering of invalid sentences
+         :return:
+             True if the sentence is a valid one
+         """
+         if not a_sentence:
+             return False
+         if not len(a_sentence):
+             return False
+         if a_sentence.strip() == '.':
+             return False
+         return True
+
+     @staticmethod
+     def _cleanse(sentences: list) -> str:
+         sentences = [sent for sent in sentences
+                      if sent != '..']
+
+         normalized = []
+
+         for s in sentences:
+             s = s.replace('\n', ' ')
+
+             if s.startswith('.. '):
+                 s = s[3:]
+
+             if s.endswith('. ..'):
+                 s = s[:len(s) - 3].strip()
+
+             normalized.append(s)
+
+         return normalized
+
+     def process(self,
+                 input_text: str) -> list:
+         """
+         Purpose:
+             Perform Sentence Segmentation
+         :param input_text:
+             any input text
+         :return:
+             a list of 0-or-More sentences
+         """
+
+         doc = self._nlp(input_text)
+
+         sentences = [str(sent) for sent in doc.sents]
+
+         sentences = [sent for sent in sentences if
+                      sent and len(sent) and sent != 'None']
+
+         sentences = [self._append_period(sent)
+                      for sent in sentences]
+
+         sentences = [sent.strip() for sent in sentences
+                      if self._is_valid_sentence(sent)]
+
+         sentences = self._cleanse(sentences)
+
+         return sentences
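
As in the previous file, only the BaseObject import changes. A hedged usage sketch for the segmenter above, assuming the package and the en_core_web_sm model are installed (the model choice mirrors the service class in the final hunk, and the dmo import path is taken from that hunk's imports):

import spacy

from fast_sentence_segment.dmo import SpacyDocSegmenter

# en_core_web_sm must be downloaded separately:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
segmenter = SpacyDocSegmenter(nlp)

# The second sentence has no terminal period; _append_period adds one before
# the cleanup steps run.
print(segmenter.process("spaCy splits this text. Each sentence is cleaned"))
# e.g. ['spaCy splits this text.', 'Each sentence is cleaned.']
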
@@ -1,2 +1,2 @@
- from .perform_paragraph_segmentation import PerformParagraphSegmentation
- from .perform_sentence_segmentation import PerformSentenceSegmentation
+ from .perform_paragraph_segmentation import PerformParagraphSegmentation
+ from .perform_sentence_segmentation import PerformSentenceSegmentation
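
These two re-exports are identical in both versions; the hunk simply records the file being dropped and re-added in the new wheel. The re-exports let callers import the two service classes from the subpackage's __init__ rather than from the individual modules. A hedged sketch of that import style, assuming (the file path is not shown in this diff) a services subpackage named fast_sentence_segment.svc:

# The 'svc' subpackage name is an assumption; this diff does not show file paths.
from fast_sentence_segment.svc import PerformParagraphSegmentation
from fast_sentence_segment.svc import PerformSentenceSegmentation
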
@@ -1,50 +1,50 @@
- #!/usr/bin/env python
- # -*- coding: UTF-8 -*-
- """ Paragraph Segmentation """
-
-
- from baseblock import BaseObject
-
-
- class PerformParagraphSegmentation(BaseObject):
-     """ Paragraph Segmentation """
-
-     def __init__(self):
-         """
-         Created:
-             1-Oct-2021
-         """
-         BaseObject.__init__(self, __name__)
-
-     def _process(self,
-                  input_text: str) -> list:
-         paragraphs = input_text.split('\n\n')
-
-         paragraphs = [x.strip() for x in paragraphs if x]
-         paragraphs = [x for x in paragraphs if len(x)]
-
-         return paragraphs
-
-     def process(self,
-                 input_text: str) -> list:
-         """Perform Paragraph Segmentation
-
-         Args:
-             input_text (str): An input string of any length or type
-
-         Raises:
-             ValueError: input must be a string
-
-         Returns:
-             list: a list of 1..* paragraphs
-                 each list item is an input string of any length, but is a paragraph
-                 A paragraph is a structural concept rather than a semantic one
-         """
-         if input_text is None or not len(input_text):
-             raise ValueError("Empty Input")
-
-         if type(input_text) != str:
-             self.logger.warning(f"Invalid Input Text: {input_text}")
-             return []
-
-         return self._process(input_text)
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """ Paragraph Segmentation """
+
+
+ from fast_sentence_segment.core import BaseObject
+
+
+ class PerformParagraphSegmentation(BaseObject):
+     """ Paragraph Segmentation """
+
+     def __init__(self):
+         """
+         Created:
+             1-Oct-2021
+         """
+         BaseObject.__init__(self, __name__)
+
+     def _process(self,
+                  input_text: str) -> list:
+         paragraphs = input_text.split('\n\n')
+
+         paragraphs = [x.strip() for x in paragraphs if x]
+         paragraphs = [x for x in paragraphs if len(x)]
+
+         return paragraphs
+
+     def process(self,
+                 input_text: str) -> list:
+         """Perform Paragraph Segmentation
+
+         Args:
+             input_text (str): An input string of any length or type
+
+         Raises:
+             ValueError: input must be a string
+
+         Returns:
+             list: a list of 1..* paragraphs
+                 each list item is an input string of any length, but is a paragraph
+                 A paragraph is a structural concept rather than a semantic one
+         """
+         if input_text is None or not len(input_text):
+             raise ValueError("Empty Input")
+
+         if not isinstance(input_text, str):
+             self.logger.warning(f"Invalid Input Text: {input_text}")
+             return []
+
+         return self._process(input_text)
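
Two changes here: the vendored BaseObject import, and the type guard moving from type(input_text) != str to not isinstance(input_text, str), which is the idiomatic form and also accepts str subclasses. A minimal standalone sketch of the split performed by _process (illustrative names, not the packaged API):

def split_paragraphs(input_text: str) -> list:
    # Mirrors the _process logic above: paragraphs are blocks separated by a
    # blank line ('\n\n'), stripped and filtered of empty entries.
    paragraphs = input_text.split('\n\n')
    paragraphs = [x.strip() for x in paragraphs if x]
    return [x for x in paragraphs if len(x)]


text = "First paragraph. Two sentences.\n\nSecond paragraph."
print(split_paragraphs(text))
# expected: ['First paragraph. Two sentences.', 'Second paragraph.']
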
@@ -1,129 +1,129 @@
- #!/usr/bin/env python
- # -*- coding: UTF-8 -*-
- """ Sentence Segmentation """
-
-
- import spacy
-
- from baseblock import BaseObject
-
- from fast_sentence_segment.dmo import NewlinesToPeriods
- from fast_sentence_segment.dmo import DelimitersToPeriods
- from fast_sentence_segment.dmo import BulletPointCleaner
- from fast_sentence_segment.dmo import NumberedListNormalizer
- from fast_sentence_segment.dmo import SpacyDocSegmenter
- from fast_sentence_segment.dmo import PostProcessStructure
-
-
- class PerformSentenceSegmentation(BaseObject):
-     """ Sentence Segmentation """
-
-     __nlp = None
-
-     def __init__(self):
-         """ Change Log
-
-         Created:
-             30-Sept-2021
-             craigtrim@gmail.com
-         Updated:
-             19-Oct-2022
-             craigtrim@gmail.com
-             *   add numbered-list normalization
-                 https://github.com/craigtrim/fast-sentence-segment/issues/1
-         """
-         BaseObject.__init__(self, __name__)
-         if not self.__nlp:
-             self.__nlp = spacy.load("en_core_web_sm")
-
-         self._delimiters_to_periods = DelimitersToPeriods.process
-         self._newlines_to_periods = NewlinesToPeriods.process
-         self._normalize_numbered_lists = NumberedListNormalizer().process
-         self._clean_bullet_points = BulletPointCleaner.process
-         self._spacy_segmenter = SpacyDocSegmenter(self.__nlp).process
-         self._post_process = PostProcessStructure().process
-
-     @staticmethod
-     def _clean_punctuation(input_text: str) -> str:
-         """ Purpose:
-         Clean punctuation oddities; this is likely highly overfitted (for now)
-         """
-         if ", Inc" in input_text:
-             input_text = input_text.replace(", Inc", " Inc")
-
-         return input_text
-
-     @staticmethod
-     def _clean_spacing(a_sentence: str) -> str:
-
-         # eliminate triple-space
-         a_sentence = a_sentence.replace('   ', ' ')
-
-         # treat double-space as delimiter
-         a_sentence = a_sentence.replace('  ', '. ')
-
-         return a_sentence
-
-     def _process(self,
-                  input_text: str) -> list:
-
-         input_text = self._delimiters_to_periods(
-             delimiter=',',
-             input_text=input_text)
-
-         input_text = self._delimiters_to_periods(
-             delimiter=';',
-             input_text=input_text)
-
-         input_text = self._normalize_numbered_lists(input_text)
-
-         input_text = self._newlines_to_periods(input_text)
-
-         input_text = self._clean_spacing(input_text)
-         if "." not in input_text:
-             return [input_text]
-
-         input_text = self._clean_bullet_points(input_text)
-         if "." not in input_text:
-             return [input_text]
-
-         input_text = self._clean_punctuation(input_text)
-         if "." not in input_text:
-             return [input_text]
-
-         sentences = self._spacy_segmenter(input_text)
-         if "." not in input_text:
-             return [input_text]
-
-         sentences = self._post_process(sentences)
-
-         sentences = [
-             self._normalize_numbered_lists(x, denormalize=True)
-             for x in sentences
-         ]
-
-         return sentences
-
-     def process(self,
-                 input_text: str) -> list:
-         """Perform Sentence Segmentation
-
-         Args:
-             input_text (str): An input string of any length or type
-
-         Raises:
-             ValueError: input must be a string
-
-         Returns:
-             list: a list of sentences
-                 each list item is an input string of any length, but is a semantic sentence
-         """
-
-         if input_text is None or not len(input_text):
-             raise ValueError("Empty Input")
-
-         if type(input_text) != str:
-             self.logger.warning(f"Invalid Input Text: {input_text}")
-             return []
-
-         return self._process(input_text)
+ #!/usr/bin/env python
+ # -*- coding: UTF-8 -*-
+ """ Sentence Segmentation """
+
+
+ import spacy
+
+ from fast_sentence_segment.core import BaseObject
+
+ from fast_sentence_segment.dmo import NewlinesToPeriods
+ from fast_sentence_segment.dmo import DelimitersToPeriods
+ from fast_sentence_segment.dmo import BulletPointCleaner
+ from fast_sentence_segment.dmo import NumberedListNormalizer
+ from fast_sentence_segment.dmo import SpacyDocSegmenter
+ from fast_sentence_segment.dmo import PostProcessStructure
+
+
+ class PerformSentenceSegmentation(BaseObject):
+     """ Sentence Segmentation """
+
+     __nlp = None
+
+     def __init__(self):
+         """ Change Log
+
+         Created:
+             30-Sept-2021
+             craigtrim@gmail.com
+         Updated:
+             19-Oct-2022
+             craigtrim@gmail.com
+             *   add numbered-list normalization
+                 https://github.com/craigtrim/fast-sentence-segment/issues/1
+         """
+         BaseObject.__init__(self, __name__)
+         if not self.__nlp:
+             self.__nlp = spacy.load("en_core_web_sm")
+
+         self._delimiters_to_periods = DelimitersToPeriods.process
+         self._newlines_to_periods = NewlinesToPeriods.process
+         self._normalize_numbered_lists = NumberedListNormalizer().process
+         self._clean_bullet_points = BulletPointCleaner.process
+         self._spacy_segmenter = SpacyDocSegmenter(self.__nlp).process
+         self._post_process = PostProcessStructure().process
+
+     @staticmethod
+     def _clean_punctuation(input_text: str) -> str:
+         """ Purpose:
+         Clean punctuation oddities; this is likely highly overfitted (for now)
+         """
+         if ", Inc" in input_text:
+             input_text = input_text.replace(", Inc", " Inc")
+
+         return input_text
+
+     @staticmethod
+     def _clean_spacing(a_sentence: str) -> str:
+
+         # eliminate triple-space
+         a_sentence = a_sentence.replace('   ', ' ')
+
+         # treat double-space as delimiter
+         a_sentence = a_sentence.replace('  ', '. ')
+
+         return a_sentence
+
+     def _process(self,
+                  input_text: str) -> list:
+
+         input_text = self._delimiters_to_periods(
+             delimiter=',',
+             input_text=input_text)
+
+         input_text = self._delimiters_to_periods(
+             delimiter=';',
+             input_text=input_text)
+
+         input_text = self._normalize_numbered_lists(input_text)
+
+         input_text = self._newlines_to_periods(input_text)
+
+         input_text = self._clean_spacing(input_text)
+         if "." not in input_text:
+             return [input_text]
+
+         input_text = self._clean_bullet_points(input_text)
+         if "." not in input_text:
+             return [input_text]
+
+         input_text = self._clean_punctuation(input_text)
+         if "." not in input_text:
+             return [input_text]
+
+         sentences = self._spacy_segmenter(input_text)
+         if "." not in input_text:
+             return [input_text]
+
+         sentences = self._post_process(sentences)
+
+         sentences = [
+             self._normalize_numbered_lists(x, denormalize=True)
+             for x in sentences
+         ]
+
+         return sentences
+
+     def process(self,
+                 input_text: str) -> list:
+         """Perform Sentence Segmentation
+
+         Args:
+             input_text (str): An input string of any length or type
+
+         Raises:
+             ValueError: input must be a string
+
+         Returns:
+             list: a list of sentences
+                 each list item is an input string of any length, but is a semantic sentence
+         """
+
+         if input_text is None or not len(input_text):
+             raise ValueError("Empty Input")
+
+         if not isinstance(input_text, str):
+             self.logger.warning(f"Invalid Input Text: {input_text}")
+             return []
+
+         return self._process(input_text)
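
The same two changes close out the diff: the BaseObject import now resolves to fast_sentence_segment.core, and the input guard uses isinstance. An end-to-end usage sketch under stated assumptions: the import path for the service classes is not shown in this diff (a svc subpackage is assumed), and en_core_web_sm must be installed for the spaCy segmenter this service wires in.

# Assumed import path; the diff does not show where these classes live.
from fast_sentence_segment.svc import PerformParagraphSegmentation
from fast_sentence_segment.svc import PerformSentenceSegmentation

paragraph_segmenter = PerformParagraphSegmentation()
sentence_segmenter = PerformSentenceSegmentation()

text = "First thought. Second thought!\n\nA new paragraph with one sentence."

# process() raises ValueError on empty input and returns [] for non-string
# input; each paragraph is segmented into sentences independently.
for paragraph in paragraph_segmenter.process(text):
    print(sentence_segmenter.process(paragraph))
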