fast-sentence-segment 1.1.8__tar.gz → 1.2.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/PKG-INFO +98 -17
  2. fast_sentence_segment-1.2.1/README.md +199 -0
  3. fast_sentence_segment-1.2.1/fast_sentence_segment/cli.py +56 -0
  4. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/__init__.py +10 -0
  5. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/abbreviation_merger.py +146 -0
  6. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/abbreviation_splitter.py +95 -0
  7. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/abbreviations.py +96 -0
  8. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/ellipsis_normalizer.py +45 -0
  9. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/numbered_list_normalizer.py +47 -0
  10. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/question_exclamation_splitter.py +59 -0
  11. fast_sentence_segment-1.2.1/fast_sentence_segment/dmo/title_name_merger.py +152 -0
  12. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/svc/perform_sentence_segmentation.py +53 -17
  13. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/pyproject.toml +4 -1
  14. fast_sentence_segment-1.2.1/setup.py +38 -0
  15. fast_sentence_segment-1.1.8/README.md +0 -118
  16. fast_sentence_segment-1.1.8/fast_sentence_segment/dmo/__init__.py +0 -6
  17. fast_sentence_segment-1.1.8/fast_sentence_segment/dmo/delimiters_to_periods.py +0 -37
  18. fast_sentence_segment-1.1.8/fast_sentence_segment/dmo/numbered_list_normalizer.py +0 -53
  19. fast_sentence_segment-1.1.8/setup.py +0 -34
  20. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/LICENSE +0 -0
  21. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/__init__.py +0 -0
  22. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/bp/__init__.py +0 -0
  23. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/bp/segmenter.py +0 -0
  24. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/core/__init__.py +0 -0
  25. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/core/base_object.py +0 -0
  26. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/core/stopwatch.py +0 -0
  27. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/dmo/bullet_point_cleaner.py +0 -0
  28. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/dmo/newlines_to_periods.py +0 -0
  29. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/dmo/post_process_sentences.py +0 -0
  30. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/dmo/spacy_doc_segmenter.py +0 -0
  31. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/svc/__init__.py +0 -0
  32. {fast_sentence_segment-1.1.8 → fast_sentence_segment-1.2.1}/fast_sentence_segment/svc/perform_paragraph_segmentation.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: fast-sentence-segment
3
- Version: 1.1.8
3
+ Version: 1.2.1
4
4
  Summary: Fast and Efficient Sentence Segmentation
5
5
  License: MIT
6
6
  License-File: LICENSE
@@ -30,17 +30,25 @@ Description-Content-Type: text/markdown
30
30
  [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
31
31
  [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
32
32
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
33
- [![spaCy](https://img.shields.io/badge/spaCy-3.5-blue.svg)](https://spacy.io/)
33
+ [![spaCy](https://img.shields.io/badge/spaCy-3.8-blue.svg)](https://spacy.io/)
34
34
 
35
- Fast and efficient sentence segmentation using spaCy. Handles complex edge cases like abbreviations (Dr., Mr., etc.), quoted text, and multi-paragraph documents.
35
+ Fast and efficient sentence segmentation using spaCy with surgical post-processing fixes. Handles complex edge cases like abbreviations (Dr., Mr., etc.), ellipses, quoted text, and multi-paragraph documents.
36
+
37
+ ## Why This Library?
38
+
39
+ 1. **Keep it local**: LLM API calls cost money and send your data to third parties. Run sentence segmentation entirely on your machine.
40
+ 2. **spaCy perfected**: spaCy is a great local model, but it makes mistakes. This library fixes most of spaCy's shortcomings.
36
41
 
37
42
  ## Features
38
43
 
39
44
  - **Paragraph-aware segmentation**: Returns sentences grouped by paragraph
40
- - **Abbreviation handling**: Correctly handles "Dr.", "Mr.", "etc." without false splits
45
+ - **Abbreviation handling**: Correctly handles "Dr.", "Mr.", "etc.", "p.m.", "a.m." without false splits
46
+ - **Ellipsis preservation**: Keeps `...` intact while detecting sentence boundaries
47
+ - **Question/exclamation splitting**: Properly splits on `?` and `!` followed by capital letters
41
48
  - **Cached processing**: LRU cache for repeated text processing
42
49
  - **Flexible output**: Nested lists (by paragraph) or flattened list of sentences
43
50
  - **Bullet point & numbered list normalization**: Cleans common list formats
51
+ - **CLI tool**: Command-line interface for quick segmentation
44
52
 
45
53
  ## Installation
46
54
 
@@ -59,12 +67,21 @@ python -m spacy download en_core_web_sm
59
67
  ```python
60
68
  from fast_sentence_segment import segment_text
61
69
 
62
- text = "Here is a Dr. who says something. And then again, what else? I don't know. Do you?"
70
+ text = "Do you like Dr. Who? I prefer Dr. Strange! Mr. T is also cool."
63
71
 
64
- results = segment_text(text)
65
- # Returns: [['Here is a Dr. who says something.', 'And then again, what else?', "I don't know.", 'Do you?']]
72
+ results = segment_text(text, flatten=True)
66
73
  ```
67
74
 
75
+ ```json
76
+ [
77
+ "Do you like Dr. Who?",
78
+ "I prefer Dr. Strange!",
79
+ "Mr. T is also cool."
80
+ ]
81
+ ```
82
+
83
+ Notice how "Dr. Who?" stays together as a single sentence—the library correctly recognizes that a title followed by a single-word name ending in `?` or `!` is a name reference, not a sentence boundary.
84
+
68
85
  ## Usage
69
86
 
70
87
  ### Basic Segmentation
@@ -74,16 +91,24 @@ The `segment_text` function returns a list of lists, where each inner list repre
74
91
  ```python
75
92
  from fast_sentence_segment import segment_text
76
93
 
77
- text = """First paragraph here. It has two sentences.
94
+ text = """Gandalf spoke softly. "All we have to decide is what to do with the time given us."
78
95
 
79
- Second paragraph starts here. This one also has multiple sentences. And a third."""
96
+ Frodo nodded. The weight of the Ring pressed against his chest."""
80
97
 
81
98
  results = segment_text(text)
82
- # Returns:
83
- # [
84
- # ['First paragraph here.', 'It has two sentences.'],
85
- # ['Second paragraph starts here.', 'This one also has multiple sentences.', 'And a third.']
86
- # ]
99
+ ```
100
+
101
+ ```json
102
+ [
103
+ [
104
+ "Gandalf spoke softly.",
105
+ "\"All we have to decide is what to do with the time given us.\"."
106
+ ],
107
+ [
108
+ "Frodo nodded.",
109
+ "The weight of the Ring pressed against his chest."
110
+ ]
111
+ ]
87
112
  ```
88
113
 
89
114
  ### Flattened Output
@@ -91,8 +116,17 @@ results = segment_text(text)
91
116
  If you don't need paragraph boundaries, use the `flatten` parameter:
92
117
 
93
118
  ```python
119
+ text = "At 9 a.m. the hobbits set out. By 3 p.m. they reached Rivendell. Mr. Frodo was exhausted."
120
+
94
121
  results = segment_text(text, flatten=True)
95
- # Returns: ['First paragraph here.', 'It has two sentences.', 'Second paragraph starts here.', ...]
122
+ ```
123
+
124
+ ```json
125
+ [
126
+ "At 9 a.m. the hobbits set out.",
127
+ "By 3 p.m. they reached Rivendell.",
128
+ "Mr. Frodo was exhausted."
129
+ ]
96
130
  ```
97
131
 
98
132
  ### Direct Segmenter Access
@@ -106,6 +140,36 @@ segmenter = Segmenter()
106
140
  results = segmenter.input_text("Your text here.")
107
141
  ```
108
142
 
143
+ ### Command Line Interface
144
+
145
+ Segment text directly from the terminal:
146
+
147
+ ```bash
148
+ # Direct text input
149
+ echo "Have you seen Dr. Who? It's brilliant!" | segment
150
+ ```
151
+
152
+ ```
153
+ Have you seen Dr. Who?
154
+ It's brilliant!
155
+ ```
156
+
157
+ ```bash
158
+ # Numbered output
159
+ segment -n "Gandalf paused... You shall not pass! The Balrog roared."
160
+ ```
161
+
162
+ ```
163
+ 1. Gandalf paused...
164
+ 2. You shall not pass!
165
+ 3. The Balrog roared.
166
+ ```
167
+
168
+ ```bash
169
+ # From file
170
+ segment -f silmarillion.txt
171
+ ```
172
+
109
173
  ## API Reference
110
174
 
111
175
  | Function | Parameters | Returns | Description |
@@ -113,6 +177,14 @@ results = segmenter.input_text("Your text here.")
113
177
  | `segment_text()` | `input_text: str`, `flatten: bool = False` | `list` | Main entry point for segmentation |
114
178
  | `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
115
179
 
180
+ ### CLI Options
181
+
182
+ | Option | Description |
183
+ |--------|-------------|
184
+ | `text` | Text to segment (positional argument) |
185
+ | `-f, --file` | Read text from file |
186
+ | `-n, --numbered` | Number output lines |
187
+
116
188
  ## Why Nested Lists?
117
189
 
118
190
  The segmentation process preserves document structure by segmenting into both paragraphs and sentences. Each outer list represents a paragraph, and each inner list contains that paragraph's sentences. This is useful for:
@@ -125,10 +197,19 @@ Use `flatten=True` when you only need sentences without paragraph context.
125
197
 
126
198
  ## Requirements
127
199
 
128
- - Python 3.8.5+
129
- - spaCy 3.5.3
200
+ - Python 3.9+
201
+ - spaCy 3.8+
130
202
  - en_core_web_sm spaCy model
131
203
 
204
+ ## How It Works
205
+
206
+ This library uses spaCy for initial sentence segmentation, then applies surgical post-processing fixes for cases where spaCy's default behavior is incorrect:
207
+
208
+ 1. **Pre-processing**: Normalize numbered lists, preserve ellipses with placeholders
209
+ 2. **spaCy segmentation**: Use spaCy's sentence boundary detection
210
+ 3. **Post-processing**: Split on abbreviation boundaries, handle `?`/`!` + capital patterns
211
+ 4. **Denormalization**: Restore placeholders to original text
212
+
132
213
  ## License
133
214
 
134
215
  MIT License - see [LICENSE](LICENSE) for details.
@@ -0,0 +1,199 @@
1
+ # Fast Sentence Segmentation
2
+
3
+ [![PyPI version](https://img.shields.io/pypi/v/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
4
+ [![Python versions](https://img.shields.io/pypi/pyversions/fast-sentence-segment.svg)](https://pypi.org/project/fast-sentence-segment/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+ [![spaCy](https://img.shields.io/badge/spaCy-3.8-blue.svg)](https://spacy.io/)
7
+
8
+ Fast and efficient sentence segmentation using spaCy with surgical post-processing fixes. Handles complex edge cases like abbreviations (Dr., Mr., etc.), ellipses, quoted text, and multi-paragraph documents.
9
+
10
+ ## Why This Library?
11
+
12
+ 1. **Keep it local**: LLM API calls cost money and send your data to third parties. Run sentence segmentation entirely on your machine.
13
+ 2. **spaCy perfected**: spaCy is a great local model, but it makes mistakes. This library fixes most of spaCy's shortcomings.
14
+
15
+ ## Features
16
+
17
+ - **Paragraph-aware segmentation**: Returns sentences grouped by paragraph
18
+ - **Abbreviation handling**: Correctly handles "Dr.", "Mr.", "etc.", "p.m.", "a.m." without false splits
19
+ - **Ellipsis preservation**: Keeps `...` intact while detecting sentence boundaries
20
+ - **Question/exclamation splitting**: Properly splits on `?` and `!` followed by capital letters
21
+ - **Cached processing**: LRU cache for repeated text processing
22
+ - **Flexible output**: Nested lists (by paragraph) or flattened list of sentences
23
+ - **Bullet point & numbered list normalization**: Cleans common list formats
24
+ - **CLI tool**: Command-line interface for quick segmentation
25
+
26
+ ## Installation
27
+
28
+ ```bash
29
+ pip install fast-sentence-segment
30
+ ```
31
+
32
+ After installation, download the spaCy model:
33
+
34
+ ```bash
35
+ python -m spacy download en_core_web_sm
36
+ ```
37
+
38
+ ## Quick Start
39
+
40
+ ```python
41
+ from fast_sentence_segment import segment_text
42
+
43
+ text = "Do you like Dr. Who? I prefer Dr. Strange! Mr. T is also cool."
44
+
45
+ results = segment_text(text, flatten=True)
46
+ ```
47
+
48
+ ```json
49
+ [
50
+ "Do you like Dr. Who?",
51
+ "I prefer Dr. Strange!",
52
+ "Mr. T is also cool."
53
+ ]
54
+ ```
55
+
56
+ Notice how "Dr. Who?" stays together as a single sentence—the library correctly recognizes that a title followed by a single-word name ending in `?` or `!` is a name reference, not a sentence boundary.
57
+
58
+ ## Usage
59
+
60
+ ### Basic Segmentation
61
+
62
+ The `segment_text` function returns a list of lists, where each inner list represents a paragraph containing its sentences:
63
+
64
+ ```python
65
+ from fast_sentence_segment import segment_text
66
+
67
+ text = """Gandalf spoke softly. "All we have to decide is what to do with the time given us."
68
+
69
+ Frodo nodded. The weight of the Ring pressed against his chest."""
70
+
71
+ results = segment_text(text)
72
+ ```
73
+
74
+ ```json
75
+ [
76
+ [
77
+ "Gandalf spoke softly.",
78
+ "\"All we have to decide is what to do with the time given us.\"."
79
+ ],
80
+ [
81
+ "Frodo nodded.",
82
+ "The weight of the Ring pressed against his chest."
83
+ ]
84
+ ]
85
+ ```
86
+
87
+ ### Flattened Output
88
+
89
+ If you don't need paragraph boundaries, use the `flatten` parameter:
90
+
91
+ ```python
92
+ text = "At 9 a.m. the hobbits set out. By 3 p.m. they reached Rivendell. Mr. Frodo was exhausted."
93
+
94
+ results = segment_text(text, flatten=True)
95
+ ```
96
+
97
+ ```json
98
+ [
99
+ "At 9 a.m. the hobbits set out.",
100
+ "By 3 p.m. they reached Rivendell.",
101
+ "Mr. Frodo was exhausted."
102
+ ]
103
+ ```
104
+
105
+ ### Direct Segmenter Access
106
+
107
+ For more control, use the `Segmenter` class directly:
108
+
109
+ ```python
110
+ from fast_sentence_segment import Segmenter
111
+
112
+ segmenter = Segmenter()
113
+ results = segmenter.input_text("Your text here.")
114
+ ```
115
+
116
+ ### Command Line Interface
117
+
118
+ Segment text directly from the terminal:
119
+
120
+ ```bash
121
+ # Direct text input
122
+ echo "Have you seen Dr. Who? It's brilliant!" | segment
123
+ ```
124
+
125
+ ```
126
+ Have you seen Dr. Who?
127
+ It's brilliant!
128
+ ```
129
+
130
+ ```bash
131
+ # Numbered output
132
+ segment -n "Gandalf paused... You shall not pass! The Balrog roared."
133
+ ```
134
+
135
+ ```
136
+ 1. Gandalf paused...
137
+ 2. You shall not pass!
138
+ 3. The Balrog roared.
139
+ ```
140
+
141
+ ```bash
142
+ # From file
143
+ segment -f silmarillion.txt
144
+ ```
145
+
146
+ ## API Reference
147
+
148
+ | Function | Parameters | Returns | Description |
149
+ |----------|------------|---------|-------------|
150
+ | `segment_text()` | `input_text: str`, `flatten: bool = False` | `list` | Main entry point for segmentation |
151
+ | `Segmenter.input_text()` | `input_text: str` | `list[list[str]]` | Cached paragraph-aware segmentation |
152
+
153
+ ### CLI Options
154
+
155
+ | Option | Description |
156
+ |--------|-------------|
157
+ | `text` | Text to segment (positional argument) |
158
+ | `-f, --file` | Read text from file |
159
+ | `-n, --numbered` | Number output lines |
160
+
161
+ ## Why Nested Lists?
162
+
163
+ The segmentation process preserves document structure by segmenting into both paragraphs and sentences. Each outer list represents a paragraph, and each inner list contains that paragraph's sentences. This is useful for:
164
+
165
+ - Document structure analysis
166
+ - Paragraph-level processing
167
+ - Maintaining original text organization
168
+
169
+ Use `flatten=True` when you only need sentences without paragraph context.
170
+
171
+ ## Requirements
172
+
173
+ - Python 3.9+
174
+ - spaCy 3.8+
175
+ - en_core_web_sm spaCy model
176
+
177
+ ## How It Works
178
+
179
+ This library uses spaCy for initial sentence segmentation, then applies surgical post-processing fixes for cases where spaCy's default behavior is incorrect:
180
+
181
+ 1. **Pre-processing**: Normalize numbered lists, preserve ellipses with placeholders
182
+ 2. **spaCy segmentation**: Use spaCy's sentence boundary detection
183
+ 3. **Post-processing**: Split on abbreviation boundaries, handle `?`/`!` + capital patterns
184
+ 4. **Denormalization**: Restore placeholders to original text
185
+
186
+ ## License
187
+
188
+ MIT License - see [LICENSE](LICENSE) for details.
189
+
190
+ ## Contributing
191
+
192
+ Contributions are welcome! Please feel free to submit a Pull Request.
193
+
194
+ 1. Fork the repository
195
+ 2. Create your feature branch (`git checkout -b feature/amazing-feature`)
196
+ 3. Run tests (`make test`)
197
+ 4. Commit your changes
198
+ 5. Push to the branch
199
+ 6. Open a Pull Request
@@ -0,0 +1,56 @@
1
# -*- coding: UTF-8 -*-
"""CLI for fast-sentence-segment."""

import argparse
import logging
import sys

from fast_sentence_segment import segment_text

# Silence library logging so the CLI emits only the segmented sentences.
logging.disable(logging.CRITICAL)


def main():
    """Entry point for the ``segment`` console script.

    Reads text from a positional argument, a file (``-f``), or piped
    stdin — in that order of precedence — segments it into sentences,
    and prints one sentence per line (numbered when ``-n`` is given).
    Exits non-zero when no input source is available or the file
    cannot be read.
    """
    parser = argparse.ArgumentParser(
        prog="segment",
        description="Segment text into sentences",
    )
    parser.add_argument(
        "text",
        nargs="?",
        help="Text to segment (or use stdin)",
    )
    parser.add_argument(
        "-f", "--file",
        help="Read text from file",
    )
    parser.add_argument(
        "-n", "--numbered",
        action="store_true",
        help="Number output lines",
    )
    args = parser.parse_args()

    # Get input text: explicit file, then positional text, then piped stdin
    if args.file:
        try:
            with open(args.file, "r", encoding="utf-8") as f:
                text = f.read()
        except OSError as e:
            # Fix: a missing/unreadable file previously crashed with a
            # traceback; report a clean CLI error instead (exits non-zero).
            parser.error(f"cannot read {args.file}: {e}")
    elif args.text:
        text = args.text
    elif not sys.stdin.isatty():
        text = sys.stdin.read()
    else:
        parser.print_help()
        sys.exit(1)

    # Segment and output, one sentence per line
    sentences = segment_text(text.strip(), flatten=True)
    for i, sentence in enumerate(sentences, 1):
        if args.numbered:
            print(f"{i}. {sentence}")
        else:
            print(sentence)


if __name__ == "__main__":
    main()
@@ -0,0 +1,10 @@
1
+ from .abbreviation_merger import AbbreviationMerger
2
+ from .abbreviation_splitter import AbbreviationSplitter
3
+ from .title_name_merger import TitleNameMerger
4
+ from .bullet_point_cleaner import BulletPointCleaner
5
+ from .ellipsis_normalizer import EllipsisNormalizer
6
+ from .newlines_to_periods import NewlinesToPeriods
7
+ from .post_process_sentences import PostProcessStructure
8
+ from .question_exclamation_splitter import QuestionExclamationSplitter
9
+ from .spacy_doc_segmenter import SpacyDocSegmenter
10
+ from .numbered_list_normalizer import NumberedListNormalizer
@@ -0,0 +1,146 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Merge sentences that spaCy incorrectly split at abbreviation boundaries.
5
+
6
+ When spaCy incorrectly splits after an abbreviation (e.g., "ext. 5" becomes
7
+ ["ext.", "5. Ask for help."]), this component merges them back together
8
+ using specific known patterns.
9
+
10
+ Reference: https://github.com/craigtrim/fast-sentence-segment/issues/3
11
+ """
12
+
13
+ import re
14
+ from typing import List, Optional, Tuple
15
+
16
+ from fast_sentence_segment.core import BaseObject
17
+
18
+
19
# Patterns where spaCy incorrectly splits after an abbreviation.
# Format: (ending_pattern, extract_pattern)
# - ending_pattern: regex matching the end of the current sentence
# - extract_pattern: regex extracting the portion of the next sentence to merge
#
# The extract_pattern MUST have a capture group for the portion to merge.
# Whatever is NOT captured remains as a separate sentence.

# Shared extract regexes:
_PLAIN_NUMBER = r"^(\d+\.?)\s*"             # e.g. "5", "42."
_DOTTED_NUMBER = r"^(\d+(?:\.\d+)?\.?)\s*"  # e.g. "14", "3.2"
_NUMBER_RANGE = r"^(\d+(?:-\d+)?\.?)\s*"    # e.g. "42-50", "100."

# (abbreviation, extract regex) — abbreviations matched case-insensitively.
_ABBREVIATION_EXTRACTS: List[Tuple[str, str]] = [
    ("ext", _PLAIN_NUMBER),   # ext. 5, Ext. 123, EXT. 42
    ("no", _PLAIN_NUMBER),    # no. 5, No. 42, NO. 100
    ("vol", _PLAIN_NUMBER),   # vol. 3, Vol. 42, VOL. 1
    ("pt", _PLAIN_NUMBER),    # pt. 2, Pt. 1, PT. 3
    ("ch", _PLAIN_NUMBER),    # ch. 5, Ch. 10, CH. 3
    ("sec", _DOTTED_NUMBER),  # sec. 3, Sec. 14, SEC. 2
    ("fig", _DOTTED_NUMBER),  # fig. 1, Fig. 3.2, FIG. 10
    ("p", _PLAIN_NUMBER),     # p. 42, P. 100
    ("pp", _NUMBER_RANGE),    # pp. 42-50, PP. 100-110
    ("art", _PLAIN_NUMBER),   # art. 5, Art. 12, ART. 1
]

MERGE_PATTERNS: List[Tuple[str, str]] = [
    (rf"(?i)\b{abbrev}\.$", extract)
    for abbrev, extract in _ABBREVIATION_EXTRACTS
]
+ ]
60
+
61
+
62
class AbbreviationMerger(BaseObject):
    """Merge sentences incorrectly split at abbreviation boundaries."""

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
        Reference:
            https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)
        # Compile patterns once for efficiency
        self._patterns = [
            (re.compile(ending), re.compile(extract))
            for ending, extract in MERGE_PATTERNS
        ]

    def _try_merge(self, current: str, next_sent: str) -> Optional[Tuple[str, str]]:
        """Try to merge two sentences based on known patterns.

        Args:
            current: Current sentence
            next_sent: Next sentence

        Returns:
            Tuple of (merged_sentence, remainder) if merge needed, else None
        """
        current = current.strip()
        next_sent = next_sent.strip()

        for ending_pattern, extract_pattern in self._patterns:
            if ending_pattern.search(current):
                match = extract_pattern.match(next_sent)
                if match:
                    # Merge the captured portion (e.g. the page/section
                    # number); whatever follows stays a separate sentence.
                    extracted = match.group(1)
                    remainder = next_sent[match.end():].strip()
                    return (current + " " + extracted, remainder)

        return None

    def process(self, sentences: List[str]) -> List[str]:
        """Process a list of sentences, merging incorrectly split ones.

        Args:
            sentences: List of sentences from spaCy

        Returns:
            List of sentences with incorrect splits merged
        """
        if not sentences:
            return sentences

        # Work on a copy: a merge with a remainder rewrites the next slot,
        # and the caller's list must never be mutated.
        sentences = list(sentences)
        result = []
        i = 0

        while i < len(sentences):
            current = sentences[i]

            # Check if we should merge with the next sentence
            if i + 1 < len(sentences):
                merge_result = self._try_merge(current, sentences[i + 1])

                if merge_result:
                    merged, remainder = merge_result
                    result.append(merged)

                    if remainder:
                        # The leftover text replaces the consumed next
                        # sentence and is re-examined on the next pass
                        # (it may itself merge with what follows).
                        #
                        # Fix: the previous version also spliced a duplicate
                        # copy of the remainder into a rebuilt list and then
                        # skipped the overwritten slot only by accident of
                        # the `i += 2` advance — an O(n) copy per merge for
                        # nothing. Overwriting in place and advancing by one
                        # yields the identical sequence of sentences.
                        sentences[i + 1] = remainder
                        i += 1
                    else:
                        # Next sentence was fully consumed by the merge
                        i += 2
                    continue

            result.append(current)
            i += 1

        return result
@@ -0,0 +1,95 @@
1
+ #!/usr/bin/env python
2
+ # -*- coding: UTF-8 -*-
3
+ """
4
+ Split sentences at abbreviation boundaries.
5
+
6
+ When spaCy fails to detect a sentence boundary after an abbreviation
7
+ (e.g., "I woke at 6 a.m. It was dark."), this component splits the
8
+ sentence by detecting the pattern: abbreviation + space + Capital letter.
9
+
10
+ Reference: https://github.com/craigtrim/fast-sentence-segment/issues/3
11
+ """
12
+
13
+ import re
14
+ from typing import List
15
+
16
+ from fast_sentence_segment.core import BaseObject
17
+ from fast_sentence_segment.dmo.abbreviations import SENTENCE_ENDING_ABBREVIATIONS
18
+
19
+
20
class AbbreviationSplitter(BaseObject):
    """Split sentences at abbreviation boundaries."""

    def __init__(self):
        """
        Created:
            27-Dec-2024
            craigtrim@gmail.com
        Reference:
            https://github.com/craigtrim/fast-sentence-segment/issues/3
        """
        BaseObject.__init__(self, __name__)
        self._pattern = self._build_pattern()

    def _build_pattern(self) -> re.Pattern:
        """Build the regex that marks abbreviation-boundary split points.

        The pattern matches a known sentence-ending abbreviation
        (regex-escaped), one or more spaces, and a capital letter that
        begins the next sentence.

        Note: Title abbreviations (Dr., Mr., etc.) are deliberately not in
        the list, since they are typically followed by names rather than
        new sentences.

        Returns:
            Compiled regex pattern
        """
        alternatives = "|".join(
            re.escape(abbrev) for abbrev in SENTENCE_ENDING_ABBREVIATIONS
        )
        return re.compile(rf"({alternatives})\s+([A-Z])")

    def _split_sentence(self, sentence: str) -> List[str]:
        """Split one sentence wherever an abbreviation boundary occurs.

        Args:
            sentence: A sentence that may contain abbreviation boundaries

        Returns:
            List of one or more sentences
        """
        pieces: List[str] = []
        tail = sentence

        match = self._pattern.search(tail)
        while match:
            # Cut immediately after the abbreviation (end of group 1);
            # the capital letter stays with the following sentence.
            boundary = match.end(1)
            head = tail[:boundary].strip()
            if head:
                pieces.append(head)
            tail = tail[boundary:].strip()
            match = self._pattern.search(tail)

        if tail.strip():
            pieces.append(tail.strip())

        return pieces if pieces else [sentence]

    def process(self, sentences: List[str]) -> List[str]:
        """Apply the abbreviation-boundary split to every sentence in order.

        Args:
            sentences: List of sentences from spaCy

        Returns:
            List of sentences with abbreviation boundaries properly split
        """
        flattened: List[str] = []
        for sentence in sentences:
            flattened.extend(self._split_sentence(sentence))
        return flattened