fast-sentence-segment 0.1.9__py3-none-any.whl → 1.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- fast_sentence_segment/__init__.py +18 -18
- fast_sentence_segment/bp/__init__.py +1 -1
- fast_sentence_segment/bp/segmenter.py +65 -68
- fast_sentence_segment/cli.py +56 -0
- fast_sentence_segment/core/__init__.py +4 -0
- fast_sentence_segment/core/base_object.py +18 -0
- fast_sentence_segment/core/stopwatch.py +38 -0
- fast_sentence_segment/dmo/__init__.py +10 -6
- fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
- fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
- fast_sentence_segment/dmo/abbreviations.py +96 -0
- fast_sentence_segment/dmo/bullet_point_cleaner.py +55 -55
- fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
- fast_sentence_segment/dmo/newlines_to_periods.py +57 -57
- fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -53
- fast_sentence_segment/dmo/post_process_sentences.py +48 -48
- fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
- fast_sentence_segment/dmo/spacy_doc_segmenter.py +101 -101
- fast_sentence_segment/dmo/title_name_merger.py +152 -0
- fast_sentence_segment/svc/__init__.py +2 -2
- fast_sentence_segment/svc/perform_paragraph_segmentation.py +50 -50
- fast_sentence_segment/svc/perform_sentence_segmentation.py +165 -129
- fast_sentence_segment-1.2.0.dist-info/METADATA +189 -0
- fast_sentence_segment-1.2.0.dist-info/RECORD +27 -0
- {fast_sentence_segment-0.1.9.dist-info → fast_sentence_segment-1.2.0.dist-info}/WHEEL +1 -1
- fast_sentence_segment-1.2.0.dist-info/entry_points.txt +3 -0
- fast_sentence_segment-1.2.0.dist-info/licenses/LICENSE +21 -0
- fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
- fast_sentence_segment-0.1.9.dist-info/METADATA +0 -54
- fast_sentence_segment-0.1.9.dist-info/RECORD +0 -16
|
@@ -1,55 +1,55 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Prevent Bullet Points from Triggering False Positive Segmentation """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class BulletPointCleaner(BaseObject):
    """ Prevent Bullet Points from Triggering False Positive Segmentation """

    def __init__(self):
        """ Change Log

        Created:
            30-Sept-2021
            craigtrim@gmail.com
        Updated:
            19-Oct-2022
            craigtrim@gmail.com
            *   clean up for segment_text_3_test.py
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str) -> str:
        """
        Purpose:
            Remove dash bullets and collapse the repeated periods and spaces
            around them so they do not trigger sentence detection
        :param input_text:
            any input text
        :return:
            preprocessed input text
        """
        # Spelled as `" " * 2` so the two-space operand cannot be silently
        # collapsed to one space by whitespace-normalizing tools. With a
        # one-space operand the final loop below replaces a space with a
        # space and never terminates.
        double_space = " " * 2

        # drop a single leading dash bullet
        if input_text.startswith("-"):
            input_text = input_text[1:]  # segment_text_3_test.py

        # one pass only here; the loop at the end finishes the job after the
        # period clean-up has run
        if double_space in input_text:
            input_text = input_text.replace(double_space, " ")

        # collapse any run of periods down to a single period
        while ".." in input_text:
            input_text = input_text.replace("..", ".")

        # a dash right after a sentence end is a bullet marker: drop it
        while ". -" in input_text:  # segment_text_3_test.py
            input_text = input_text.replace(". -", ". ")

        # merge periods separated by a single space
        while ". . " in input_text:
            input_text = input_text.replace(". . ", ".")

        # the substitutions above can leave runs of spaces; squeeze them
        while double_space in input_text:
            input_text = input_text.replace(double_space, " ")

        return input_text
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Prevent Bullet Points from Triggering False Positive Segmentation """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from fast_sentence_segment.core import BaseObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class BulletPointCleaner(BaseObject):
    """ Prevent Bullet Points from Triggering False Positive Segmentation """

    def __init__(self):
        """ Change Log

        Created:
            30-Sept-2021
            craigtrim@gmail.com
        Updated:
            19-Oct-2022
            craigtrim@gmail.com
            *   clean up for segment_text_3_test.py
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str) -> str:
        """
        Purpose:
            Remove dash bullets and collapse the repeated periods and spaces
            around them so they do not trigger sentence detection
        :param input_text:
            any input text
        :return:
            preprocessed input text
        """
        # Spelled as `" " * 2` so the two-space operand cannot be silently
        # collapsed to one space by whitespace-normalizing tools. With a
        # one-space operand the final loop below replaces a space with a
        # space and never terminates.
        double_space = " " * 2

        # drop a single leading dash bullet
        if input_text.startswith("-"):
            input_text = input_text[1:]  # segment_text_3_test.py

        # one pass only here; the loop at the end finishes the job after the
        # period clean-up has run
        if double_space in input_text:
            input_text = input_text.replace(double_space, " ")

        # collapse any run of periods down to a single period
        while ".." in input_text:
            input_text = input_text.replace("..", ".")

        # a dash right after a sentence end is a bullet marker: drop it
        while ". -" in input_text:  # segment_text_3_test.py
            input_text = input_text.replace(". -", ". ")

        # merge periods separated by a single space
        while ". . " in input_text:
            input_text = input_text.replace(". . ", ".")

        # the substitutions above can leave runs of spaces; squeeze them
        while double_space in input_text:
            input_text = input_text.replace(double_space, " ")

        return input_text
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Normalize Ellipses to prevent them being stripped by cleanup routines """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from fast_sentence_segment.core import BaseObject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
PLACEHOLDER = "〈ELLIPSIS〉"

# Three dots, then whitespace (group 1), then an ASCII capital (group 2)
BOUNDARY_PATTERN = re.compile(r'\.\.\.(\s+)([A-Z])')


class EllipsisNormalizer(BaseObject):
    """ Swap ellipses for a placeholder token and back again """

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
            *   preserve ellipses through the pipeline
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                input_text: str,
                denormalize: bool = False) -> str:
        """
        Purpose:
            Hide every "..." behind PLACEHOLDER, or restore it
        :param input_text:
            any input text
        :param denormalize:
            False (default) replaces ellipses with the placeholder;
            True turns placeholders back into "..."
        :return:
            the transformed text
        """
        if denormalize:
            # the "placeholder + period" form goes first so the period that
            # was added at a boundary is consumed, not left behind
            restored = input_text.replace(PLACEHOLDER + ".", "...")
            return restored.replace(PLACEHOLDER, "...")

        # ellipsis before whitespace + capital: placeholder plus a real
        # period, keeping the whitespace and the capital letter
        marked = BOUNDARY_PATTERN.sub(PLACEHOLDER + r'.\1\2', input_text)

        # every other ellipsis becomes the bare placeholder
        return marked.replace("...", PLACEHOLDER)
|
|
@@ -1,57 +1,57 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Convert New Lines into Periods """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class NewlinesToPeriods(BaseObject):
    """ Replace New Lines with Spaces (the class name is historical) """

    def __init__(self):
        """
        Created:
            30-Sept-2021
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str):
        """
        Purpose:
            Flatten multi-line text by turning each newline into one space
        :param input_text:
            any input text
        :return:
            the text with every '\\n' replaced by ' '
        """
        # 20230309; a newline is deliberately NOT turned into a period:
        # that too often causes confusion and puts a period where one
        # should not exist
        return input_text.replace('\n', ' ')
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Convert New Lines into Periods """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from fast_sentence_segment.core import BaseObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NewlinesToPeriods(BaseObject):
    """ Replace New Lines with Spaces (the class name is historical) """

    def __init__(self):
        """
        Created:
            30-Sept-2021
        """
        BaseObject.__init__(self, __name__)

    @staticmethod
    def process(input_text: str):
        """
        Purpose:
            Flatten multi-line text by turning each newline into one space
        :param input_text:
            any input text
        :return:
            the text with every '\\n' replaced by ' '
        """
        # 20230309; a newline is deliberately NOT turned into a period:
        # that too often causes confusion and puts a period where one
        # should not exist
        return input_text.replace('\n', ' ')
|
|
@@ -1,53 +1,47 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Normalize Numbered Lists to prevent False Positive Segmentation """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
for candidate in d_rev:
|
|
49
|
-
if candidate in input_text:
|
|
50
|
-
input_text = input_text.replace(
|
|
51
|
-
candidate, d_rev[candidate])
|
|
52
|
-
|
|
53
|
-
return input_text
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Normalize Numbered Lists to prevent False Positive Segmentation """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
|
|
8
|
+
from fast_sentence_segment.core import BaseObject
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class NumberedListNormalizer(BaseObject):
    """ Normalize Numbered Lists to prevent False Positive Segmentation """

    # Line-start markers: start of string or a newline (plus any following
    # whitespace), then a 1-2 digit number, then ". " or "_ "
    __normalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})\. ')
    __denormalize_line_start = re.compile(r'(^|\n\s*)(\d{1,2})_ ')

    # Inline markers: ". " followed by a 1-2 digit number, then ". " or "_ "
    __normalize_inline = re.compile(r'(\. )(\d{1,2})\. ')
    __denormalize_inline = re.compile(r'(\. )(\d{1,2})_ ')

    def __init__(self):
        """
        Created:
            19-Oct-2022
            craigtrim@gmail.com
            *   https://github.com/craigtrim/fast-sentence-segment/issues/1
        Updated:
            27-Dec-2024
            craigtrim@gmail.com
            *   fix to only match at line starts, not mid-sentence
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                input_text: str,
                denormalize: bool = False) -> str:
        """
        Purpose:
            Rewrite list markers "N. " as "N_ ", or undo that rewrite
        :param input_text:
            any input text
        :param denormalize:
            False (default) turns "N. " into "N_ ";
            True turns "N_ " back into "N. "
        :return:
            the transformed text
        """
        if denormalize:
            patterns = (self.__denormalize_line_start,
                        self.__denormalize_inline)
            template = r'\1\2. '
        else:
            patterns = (self.__normalize_line_start,
                        self.__normalize_inline)
            template = r'\1\2_ '

        # line-start pattern runs before the inline pattern in both modes
        for pattern in patterns:
            input_text = pattern.sub(template, input_text)

        return input_text
|
|
@@ -1,48 +1,48 @@
|
|
|
1
|
-
#!/usr/bin/env python
|
|
2
|
-
# -*- coding: UTF-8 -*-
|
|
3
|
-
""" Post Process Sentences """
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
from
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
class PostProcessStructure(BaseObject):
    """ Post Process Sentences """

    # punctuation pair -> replacement; applied in this order
    __replace = {
        '..': '. ',
        '. .': '. ',

        ',.': ', ',
        ', .': ', ',

        '!.': '! ',
        '! .': '! ',

        '?.': '? ',
        '? .': '? ',

        ':.': ': ',
        ': .': ': ',
    }

    def __init__(self):
        """
        Created:
            1-Oct-2021
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                sentences: list) -> list:
        """
        Purpose:
            Clean up doubled punctuation inside each sentence
        :param sentences:
            list of sentence strings
        :return:
            a new list, same length and order, with each replacement applied
        """
        cleaned = []

        for text in sentences:
            for needle, substitute in self.__replace.items():
                # skip misses: the strip() below must only run after a hit
                if needle not in text:
                    continue
                text = text.replace(needle, substitute).strip()
            cleaned.append(text)

        return cleaned
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Post Process Sentences """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
from fast_sentence_segment.core import BaseObject
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class PostProcessStructure(BaseObject):
    """ Post Process Sentences """

    # punctuation pair -> replacement; applied in this order
    __replace = {
        '..': '. ',
        '. .': '. ',

        ',.': ', ',
        ', .': ', ',

        '!.': '! ',
        '! .': '! ',

        '?.': '? ',
        '? .': '? ',

        ':.': ': ',
        ': .': ': ',
    }

    def __init__(self):
        """
        Created:
            1-Oct-2021
        """
        BaseObject.__init__(self, __name__)

    def process(self,
                sentences: list) -> list:
        """
        Purpose:
            Clean up doubled punctuation inside each sentence
        :param sentences:
            list of sentence strings
        :return:
            a new list, same length and order, with each replacement applied
        """
        cleaned = []

        for text in sentences:
            for needle, substitute in self.__replace.items():
                # skip misses: the strip() below must only run after a hit
                if needle not in text:
                    continue
                text = text.replace(needle, substitute).strip()
            cleaned.append(text)

        return cleaned
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
#!/usr/bin/env python
|
|
2
|
+
# -*- coding: UTF-8 -*-
|
|
3
|
+
""" Split sentences at ? and ! followed by capital letter """
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
import re
|
|
7
|
+
from typing import List
|
|
8
|
+
|
|
9
|
+
from fast_sentence_segment.core import BaseObject
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
# Pattern: ? or ! (group 1), whitespace (group 2), ASCII capital (group 3)
BOUNDARY_PATTERN = re.compile(r'([?!])(\s+)([A-Z])')


class QuestionExclamationSplitter(BaseObject):
    """ Split sentences at ? and ! followed by capital letter """

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
            *   spaCy doesn't always split on ? and ! boundaries
                https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)

    def process(self, sentences: List[str]) -> List[str]:
        """Split sentences that contain ? or ! followed by capital letter.

        Each sentence is cut at every BOUNDARY_PATTERN match. The
        punctuation stays with the text before it, the capital letter
        starts the next piece, and the whitespace between them is dropped.
        Sentences without a match pass through unchanged (apart from the
        final strip).

        Args:
            sentences: List of sentences from earlier processing

        Returns:
            List of stripped, non-empty sentences with ? and ! boundaries
            split
        """
        pieces = []

        for sent in sentences:
            start = 0
            for boundary in BOUNDARY_PATTERN.finditer(sent):
                # cut just after the ? or ! so it stays with this piece
                pieces.append(sent[start:boundary.end(1)])
                # the next piece begins at the capital letter
                start = boundary.start(3)
            # the tail after the last boundary, or the whole sentence when
            # nothing matched
            pieces.append(sent[start:])

        stripped = (piece.strip() for piece in pieces)
        return [piece for piece in stripped if piece]
|