yasbd-lib 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
yasbd/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from yasbd.boundary_detector import BoundaryDetector, ParagraphEOF
2
+
3
+ __all__ = ["BoundaryDetector", "ParagraphEOF"]
@@ -0,0 +1,187 @@
1
+ from collections.abc import Generator, Iterable
2
+ from importlib import import_module
3
+ from io import TextIOBase
4
+ from itertools import tee
5
+
6
+ from loguru import logger
7
+
8
+ from yasbd.utils.cleaner_stub import StreamCleanerStub
9
+ from yasbd.utils.input_validator import validate_input
10
+ from yasbd.utils.paragraph_stream import ParagraphStream
11
+
12
class _ParagraphEOF:
    """Type of the ``ParagraphEOF`` sentinel.

    Exactly one instance exists (the module-level ``ParagraphEOF``) and
    callers are expected to recognise it with an identity check.
    """

    def __repr__(self) -> str:
        return "ParagraphEOF"

    def __reduce__(self) -> str:
        # Returning a string makes pickle and copy resolve the module-level
        # name instead of building a new object, so identity checks still
        # work after a round trip through pickle, copy or deepcopy.
        return "ParagraphEOF"


# Signals transition between paragraphs in relative mode
# during boundary detection
ParagraphEOF = _ParagraphEOF()
15
+
16
+
17
+ class BoundaryDetector:
18
@validate_input
def __init__(
    self,
    lang: str = "en",
    *,
    preserve_quote_and_paren: bool = True,
    verbose: bool = False,
):
    """Create a detector bound to one language rule set.

    Args:
        lang: Two-letter ISO language code (e.g. ``en``, ``fr``).
        preserve_quote_and_paren: When ``True``, terminators inside
            quoted or parenthesised text do not split sentences.
        verbose: When ``True``, emit informational log messages.
    """
    # ``verbose`` has to exist before ``lang`` is assigned: the ``lang``
    # setter and ``_load_rule`` both read it.
    self.verbose = verbose
    self.preserve_quote_and_paren = preserve_quote_and_paren
    # Assigning through the property loads the matching rule module.
    self.lang = lang.lower()

    if self.verbose:  # pragma: no cover
        logger.info(
            "Initialized with lang={!r}, preserve_quote_and_paren={}, verbose={}",
            self._lang,
            self.preserve_quote_and_paren,
            self.verbose,
        )
44
+
45
@property
def lang(self) -> str:
    """Return the ISO code of the rule set currently in use."""
    return self._lang

@lang.setter
def lang(self, lang: str) -> None:
    # Codes are compared case-insensitively; re-assigning the active
    # language is a no-op.
    requested = lang.lower()
    previous = getattr(self, "_lang", None)
    if requested != previous:
        # Load first so that a failing load leaves the old language active.
        self._load_rule(requested)
        self._lang = requested
        if self.verbose:  # pragma: no cover
            logger.info("Language switched from {} to {}", previous, self._lang)
61
+
62
+ def _load_rule(self, lang: str) -> None:
63
+ """Dynamically import and instantiate the rule module for *lang*."""
64
+ if self.verbose: # pragma: no cover
65
+ logger.info("Trying to load rule module for {}", lang)
66
+
67
+ try:
68
+ rule_module = import_module(f"yasbd.rules.{lang}")
69
+ except ModuleNotFoundError:
70
+ raise ValueError(f"Unsupported language: {lang!r}") from None
71
+
72
+ self._rule = getattr(rule_module, f"{lang.capitalize()}Rules")()
73
+
74
+ def _detect_relative_spans(
75
+ self,
76
+ para_iter: Iterable[str],
77
+ ) -> Generator[tuple[int, int], None, None]:
78
+ """Yield per-paragraph sentence spans."""
79
+ for para in para_iter:
80
+ if not para or para.isspace():
81
+ boundaries = [0, len(para)]
82
+ else:
83
+ boundaries = self._rule.apply(para, self.preserve_quote_and_paren)
84
+
85
+ for i in range(len(boundaries) - 1):
86
+ start = boundaries[i]
87
+ end = boundaries[i + 1]
88
+ yield (start, end)
89
+
90
@validate_input
def detect(
    self,
    source: str | TextIOBase | StreamCleanerStub,
    *,
    relative: bool = False,
) -> Generator[int, None, None]:
    """Yield the sentence boundary offsets found in *source*.

    Args:
        source: Plain text, an open text stream (e.g. ``StringIO``), or
            any other accepted object, which is then iterated directly
            as a sequence of paragraphs.
        relative: With ``False`` (default) offsets count characters from
            the start of the whole input. With ``True`` offsets restart
            at zero for every paragraph.

    Note:
        In relative mode a ``ParagraphEOF`` sentinel is emitted for every
        empty or whitespace-only paragraph and before every non-blank
        paragraph except the first one. Import it with
        ``from yasbd import ParagraphEOF``.

    Yields:
        Integer boundary offsets or ``ParagraphEOF`` sentinels.
    """
    if self.verbose:  # pragma: no cover
        logger.info(
            "Called with type={}, relative={}", type(source).__name__, relative
        )

    if isinstance(source, (str, TextIOBase)):
        paragraphs = ParagraphStream(source)
    else:
        paragraphs = source

    base = 0  # absolute offset of the current paragraph; unused in relative mode
    seen_text = False
    for para in paragraphs:
        if not para or para.isspace():
            # Blank paragraph: nothing to split, only bookkeeping.
            if relative:
                yield ParagraphEOF
            else:
                base += len(para)
            continue

        if relative and seen_text:
            yield ParagraphEOF
        seen_text = True

        cuts = self._rule.apply(para.rstrip(), self.preserve_quote_and_paren)

        # The first entry is the paragraph start, which is not a boundary.
        for pos in cuts[1:]:
            yield pos if relative else base + pos

        if not relative:
            base += len(para)
146
+
147
+ @validate_input
148
+ def segment(
149
+ self,
150
+ source: str | TextIOBase | StreamCleanerStub,
151
+ *,
152
+ preserve_whitespace: bool = False,
153
+ ) -> Generator[str, None, None]:
154
+ """Split text into sentences.
155
+
156
+ Args:
157
+ source: Plain text string or ``TextIOBase`` stream (e.g., ``StringIO``, opened file); any other accepted object is iterated directly as paragraphs.
158
+ preserve_whitespace: If ``False`` (default), strip leading and
159
+ trailing whitespace from each sentence and drop sentences that end up empty. The inverted flag is also passed to ``ParagraphStream`` as ``skip_empty_lines``.
160
+
161
+ Yields:
162
+ Individual sentences as strings.
163
+ """
164
+ if self.verbose: # pragma: no cover
165
+ logger.info("Called with preserve_whitespace={}", preserve_whitespace)
166
+
167
+ para_iter = (
168
+ ParagraphStream(source, skip_empty_lines=not preserve_whitespace)
169
+ if isinstance(source, (str, TextIOBase))
170
+ else source
171
+ )
172
+
173
+ input_for_detection, input_for_slicing = tee(para_iter) # one copy feeds span detection, the other supplies the text to slice
174
+ curr_para = None
175
+ for start, end in self._detect_relative_spans(input_for_detection):
176
+ if start == 0: # spans are paragraph-relative, so a span starting at 0 opens the next paragraph
177
+ curr_para = next(input_for_slicing, None)
178
+
179
+ if curr_para is None:
180
+ break
181
+
182
+ sent = curr_para[start:end]
183
+ if not preserve_whitespace:
184
+ sent = sent.strip()
185
+ if not sent:
186
+ continue
187
+ yield sent
File without changes
@@ -0,0 +1,57 @@
1
+ from yasbd.rules.base import Rules
2
+
3
+
4
+ # Don't remove the fmt guards as they keep compact set formatting
5
+ # fmt: off
6
+ class LangRules(Rules):
7
+ """Template for adding new language rule modules.
8
+
9
+ Copy this file and rename it to ``<lang>.py`` (e.g.
10
+ ``fr.py``), rename the class to ``<Lang>Rules`` (e.g.
11
+ ``FrRules``) and override only the sets your language needs
12
+ (please, not all of them). Every ``{...}`` below is a placeholder: replace it with a set of strings, and delete the attributes you do not override.
13
+ """
14
+
15
+ # Extra sentence terminators used by the language.
16
+ TERMINATORS = Rules.TERMINATORS | {...}
17
+
18
+ # Honorifics and professional abbreviations that should not split sentences.
19
+ TITLE_ABBRVS = Rules.TITLE_ABBRVS | {...}
20
+
21
+ # Country and regional abbreviations written with periods (U.S., E.U., etc.).
22
+ GEOPOLITICAL_ABBRVS = Rules.GEOPOLITICAL_ABBRVS | {...}
23
+
24
+ # Citation and reference abbreviations commonly used mid-text.
25
+ REFERENCE_ABBRVS = Rules.REFERENCE_ABBRVS | {...}
26
+
27
+ # Street and address abbreviations (Ave., Blvd., Rd., etc.).
28
+ STREET_ABBRVS = Rules.STREET_ABBRVS | {...}
29
+
30
+ # Common inline abbreviations that should not end a sentence.
31
+ MID_SENTENCE_ABBRVS = Rules.MID_SENTENCE_ABBRVS | {...}
32
+
33
+ # Names or titles containing "!" that should not trigger sentence breaks.
34
+ NAMES_WITH_EXCLAMATION = Rules.NAMES_WITH_EXCLAMATION | {...}
35
+
36
+ # Month, weekday, and calendar abbreviations.
37
+ DATE_ABBRVS = Rules.DATE_ABBRVS | {...}
38
+
39
+ # Common nouns appearing inside organization or institution names (not merged with the base set).
40
+ COMMON_ORG_NOUNS = {...}
41
+
42
+ # Frequently occurring sentence starters used as weak boundary hints (not merged with the base set).
43
+ COMMON_SENT_STARTERS = {...}
44
+
45
+ # -- Mostly useful for unicase or weakly-cased languages --
46
+
47
+ # Quotative particles used after speech, thoughts, or labels.
48
+ QUOTATIVE_PARTICLES = {...}
49
+
50
+ # Postpositional case markers that tightly bind an abbreviation to the clause
51
+ # (Japanese の, Chinese 的, etc.).
52
+ CASE_MARKERS = {...}
53
+
54
+ # Verbs commonly used for dialogue attribution or reported speech.
55
+ REPORTING_WORDS = {...}
56
+
57
+ # fmt: on
yasbd/rules/base.py ADDED
@@ -0,0 +1,366 @@
1
+ import re # For simpler patterns
2
+
3
+ import regex as re2
4
+
5
+
6
+ def _build_abbr_pattern(options: set[str]) -> str:
7
+ """Build a safe escaped regex alternation pattern.
8
+
9
+ Returns a never-match pattern if no valid options exist.
10
+ Ref: https://stackoverflow.com/questions/1723182/a-regex-that-will-never-be-matched-by-anything?
11
+ """
12
+ cleaned = [
13
+ re2.escape(opt.strip())
14
+ for opt in sorted(options, key=len, reverse=True)
15
+ if opt.strip()
16
+ ]
17
+
18
+ return "|".join(cleaned) if cleaned else r"(?!)"
19
+
20
+
21
+ # fmt: off
22
+ class Rules:
23
+ TERMINATORS = {"。", ".", ".", "!", "!", "?", "?", "‼", "⁉", "⁈"}
24
+
25
+ TITLE_ABBRVS = {
26
+ # Standard Professional (Universal Latin roots)
27
+ "dr", "drs", "prof", "hon", "rev", "supt", "insp",
28
+
29
+ # Global Social (Overlap across English/Spanish/Portuguese/French)
30
+ "mr", "mrs", "ms", "st",
31
+
32
+ # Military (NATO/International Standardized Ranks)
33
+ "adm", "brig", "capt", "cmdr", "col", "cpl", "gen", "lt", "maj", "sgt", "pvt",
34
+
35
+ # Political/Administrative (Common in Western bureaucracy)
36
+ "gov", "rep", "sen", "pres"
37
+ }
38
+
39
+ GEOPOLITICAL_ABBRVS = {
40
+ # North Atlantic / Western Europe
41
+ "u.s", "u.s.a", "u.k", "e.u",
42
+ "u.s", "u.s.a", "u.k", "e.u",
43
+
44
+ # Multilateral / Intergovernmental
45
+ "u.n", "u.s.s.r",
46
+ "u.n", "u.s.s.r",
47
+
48
+ # Asia / Middle East
49
+ "u.a.e", "p.r.c", "r.o.k",
50
+ "u.a.e", "p.r.c", "r.o.k",
51
+ }
52
+
53
+ REFERENCE_ABBRVS = {
54
+ # Publishing / Documents
55
+ "et al", "app", "apps", "cf", "ext", "fig", "figs", "l", "ll",
56
+ "n", "nn", "p", "pp", "pag", "pt", "pts", "ref", "refs", "tab",
57
+ "tbl", "tbls", "v", "vol", "vols",
58
+
59
+ # Section / Structure
60
+ "ann", "art", "arts", "cap", "cl", "cls", "col", "cols", "para",
61
+ "paras", "sec", "sect", "secs", "subsec",
62
+
63
+ # Legal / Numbering
64
+ "no", "nos", "reg", "regs",
65
+
66
+ # Scientific / Math / Technical
67
+ "approx", "eq", "eqn", "eqs", "est", "ex", "exs",
68
+
69
+ # Academic
70
+ "univ", "s",
71
+ }
72
+
73
+ DATE_ABBRVS = {
74
+ # Months
75
+ "jan", "feb", "mar", "apr", "jun", "jul", "sep",
76
+ "sept", "oct", "nov", "dec", "déc",
77
+
78
+ # Day
79
+ "mon", "tue", "wed", "thu", "fri", "sat", "sun",
80
+ "lun", "mar", "dom",
81
+ }
82
+
83
+ MID_SENTENCE_ABBRVS = {
84
+ # Business entity bridges
85
+ "assoc", "mfg",
86
+
87
+ # Bridge/connectors
88
+ "cf", "eg", "e.g", "ie", "i.e", "vs", "v", "viz", "ibid", "ca", "sc",
89
+
90
+ # Street & directional anchors
91
+ "mt", "dist",
92
+
93
+ # General
94
+ "approx", "est", "intl", "misc",
95
+ }
96
+
97
+ STREET_ABBRVS = {
98
+ "ave", "blvd", "blv", "ct", "ln", "pl", "rd", "sq", "st", "wy", "way"
99
+ }
100
+
101
+ NAMES_WITH_EXCLAMATION = {
102
+ # Tech, Corporate Entities, & Major Consumer Brands
103
+ "Yahoo", "Yum", "Chips Ahoy", "Kahoot", "JOOP", "Walla",
104
+ "I Can't Believe It's Not Butter", "Pop",
105
+
106
+ # Gaming, Media, Animation, & Entertainment
107
+ "Mamma Mia", "Jeopardy", "Oklahoma", "Oliver", "Shindig",
108
+ "Hailey's On It", "Airplane", "Osu", "Ha", "VSPO",
109
+
110
+ # Geopolitical Quirks / Municipalities
111
+ "Westward Ho", "Saint-Louis-du-Ha", "Baie-des-Ha",
112
+
113
+ # Public Figures, Politics, & Manufacturing Brands
114
+ "Jeb", "Éxito", "Hey Man", "Basta", "Elliot S"
115
+ }
116
+
117
+ COMMON_ORG_NOUNS = set()
118
+ COMMON_SENT_STARTERS = set()
119
+ QUOTATIVE_PARTICLES = set()
120
+ CASE_MARKERS = set()
121
+ REPORTING_WORDS = set()
122
+
123
+ # https://regex101.com/r/tI9Cmg/2
124
+ VERTICAL_LIST_START_FINDER = re2.compile(r"(?<=^\s*(?:[\p{L}\p{N}]\.){1,3})(?=\s)")
125
+
126
+ # https://regex101.com/r/JYdWZw/4
127
+ QUOTE_AND_PAREN_FINDER = re2.compile(
128
+ r"""
129
+ (?:\p{Pi}|»|(?<=[\s:])(['""])).+?(?:\p{Pf}|«|\1)| # Quoted text
130
+ \p{Ps}.+?\p{Pe} # Parenthesized text
131
+ """,
132
+ re2.X,
133
+ )
134
+
135
+ # https://regex101.com/r/0P9f2V/1
136
+ TOC_LEADER_FINDER = re.compile(r"[^\W_][\s\.]{4,}\d")
137
+
138
+ # https://regex101.com/r/ZOZlLb/2/substitution
139
+ NEWLINE_INSIDE_SENTENCE_FINDER = re2.compile(r"(?<=[,:;)\w\s])\n(?=([a-z(]))")
140
+
141
+ _REGEX_CACHED = False
142
+ # fmt: on
143
+ def __init__(self):
144
+ """Initialize rule instance with lazy-compiled regex patterns.
145
+
146
+ Patterns are compiled once per class and cached via ``_REGEX_CACHED``.
147
+ Subclasses can override data constants (abbreviation sets, terminators, etc.)
148
+ and the classmethod ``_compile_regex_dynamically`` will pick them up.
149
+ """
150
+ if not type(self).__dict__.get("_REGEX_CACHED", False):
151
+ self._compile_regex_dynamically()
152
+ type(self)._REGEX_CACHED = True
153
+
154
+ @classmethod
155
+ def _compile_regex_dynamically(cls):
156
+ """Compile the regexes derived from this class's terminator/abbreviation sets and store them on ``cls`` (``HORIZONTAL_LIST_FINDER``, ``NAIVE_BOUNDARY_FINDER``, ``MID_SENTENCE_FINDER_LST``, ``QUOTE_AND_PAREN_END_FINDER``, ``CONTIGUOUS_TERMINATORS_FINDER``)."""
157
+ terminators_pattern = "".join(cls.TERMINATORS) # interpolated unescaped inside [...] character classes below
158
+ dots_pattern = r"[..]" # NOTE(review): both characters render as the same dot here - confirm whether one was meant to be a different (e.g. fullwidth) dot
159
+ title_abbrvs_pattern = _build_abbr_pattern(cls.TITLE_ABBRVS)
160
+ geopolitical_abbrvs_pattern = _build_abbr_pattern(cls.GEOPOLITICAL_ABBRVS)
161
+ common_starters_pattern = _build_abbr_pattern(cls.COMMON_SENT_STARTERS)
162
+
163
+ # https://regex101.com/r/qBSyU5/12
164
+ # Handle flattened lists due to messy OCR.
165
+ cls.HORIZONTAL_LIST_FINDER = re.compile(
166
+ rf"""
167
+ (?: # Must preceded by
168
+ ^\s*| # A string start
169
+ [:{terminators_pattern}]\s+ # A terminator or double colon + space
170
+ )
171
+ (?:[•◦]\s+)? # Optional bullet point (e.g., • 9.)
172
+ (?:
173
+ [-*+]| # Markdown style list
174
+ (?:\d{{1,2}}[.)]{{1,2}}|[a-zA-Z]\)) # Numbered and alphabetical list (e.g, a\), 34.\), 1.)
175
+ )
176
+ (?=\s) # Must followed by a space
177
+ """,
178
+ re.X,
179
+ )
180
+
181
+ # https://regex101.com/r/VMzYsx/9
182
+ cls.NAIVE_BOUNDARY_FINDER = re2.compile(
183
+ rf"""
184
+ # Split if left token is a unicase letter (Always)
185
+ (?<=\p{{Lo}}\s*[{terminators_pattern}])|
186
+
187
+ # Split after any terminators followed by a a newline,
188
+ # common sentence starter, Space+Upper or unicase letter
189
+ (?<=[{terminators_pattern}])
190
+ (?=
191
+ \s*\n|
192
+ \s+(?:[^\p{{Ll}}]|
193
+ \s+(?<!\.\.)(?i:{common_starters_pattern})\b)|
194
+ \s*\p{{Lo}}
195
+ )|
196
+
197
+ # Split at transition between Latin letters separate by alien
198
+ (?<=[\p{{LU}}\p{{Ll}}][​。!?।])(?=[\p{{Lu}}])|
199
+
200
+ # Cluster of terminators (e.g hello!!! r u ok?)
201
+ (?<=[{terminators_pattern.replace('.', '')}]{{2,}})(?=\s)
202
+ """,
203
+ re2.X,
204
+ )
205
+
206
+ # fmt: off
207
+ # Several small patterns; the original author notes this is faster than one big regex.
208
+ # https://regex101.com/r/svyCoU/18
209
+ cls.MID_SENTENCE_FINDER_LST = [
210
+ # Title abbrv or initialisms (e.g., Dr. Paul)
211
+ re.compile(rf"\b(?i:{title_abbrvs_pattern}){dots_pattern}"),
212
+
213
+ # Geopolitical abbrv followed by a case marker or a common org noun (e.g., U.S.A Army)
214
+ re.compile(rf"""
215
+ \b(?i:{geopolitical_abbrvs_pattern}){dots_pattern}
216
+ (?=
217
+ \s*(?:{_build_abbr_pattern(cls.CASE_MARKERS)})|
218
+ \s+(?:{_build_abbr_pattern(cls.COMMON_ORG_NOUNS)})
219
+ )
220
+ """, re.X
221
+ ),
222
+
223
+ # Abbrv that NEVER ends a sentence
224
+ re.compile(
225
+ rf"\b(?i:{_build_abbr_pattern(cls.MID_SENTENCE_ABBRVS)}){dots_pattern}"
226
+ ),
227
+
228
+ # References abbrv followed by a number, a letter or opened paren (e.g., to p. 55, app. A)
229
+ re2.compile(rf"""
230
+ \b(?i:{_build_abbr_pattern(cls.REFERENCE_ABBRVS)}){dots_pattern}
231
+ (?=\s+(?:\(|\p{{Lu}}\b|\p{{N}}|[IVXLCDM]+))
232
+ """, re2.X
233
+ ),
234
+
235
+ # Date abbrv followed by a number
236
+ re2.compile(
237
+ rf"\b(?i:{_build_abbr_pattern(cls.DATE_ABBRVS)}){dots_pattern}(?=\s+\p{{N}})"
238
+ ),
239
+
240
+ # Streets/Acronyms/Exclamations words (e.g., Yahoo!, A.B. Holding, Ave. Central)
241
+ # excluding geopolitical acronyms; none of these count when a common sentence starter follows
+ # NOTE(review): the negative lookbehind runs right after the acronym's final dot, while the GEOPOLITICAL_ABBRVS entries visible in this file carry no trailing dot - confirm the exclusion can actually trigger
242
+ re2.compile(rf"""
243
+ (?:\p{{Lu}}\.){{2,}}(?<!(?i:{geopolitical_abbrvs_pattern}))
244
+ (?!\s+(?:{common_starters_pattern})\b)
245
+ """, re2.X
246
+ ),
247
+ re.compile(rf"""
248
+ (?:
249
+ \b(?i:{_build_abbr_pattern(cls.STREET_ABBRVS)}){dots_pattern}|
250
+ (?i:{_build_abbr_pattern(cls.NAMES_WITH_EXCLAMATION)})[! !‼]
251
+ )
252
+ (?!\s+(?:{common_starters_pattern})\b)
253
+ """, re.X
254
+ ),
255
+
256
+ # Collapsed middle name (e.g, Jonas E. Smith)
257
+ re2.compile(rf"\s\b(?:\p{{Lu}}){dots_pattern}(?=\s)"),
258
+ ]
259
+ # fmt: on
260
+
261
+ # https://regex101.com/r/EGkRU8/6
262
+ cls.QUOTE_AND_PAREN_END_FINDER = re2.compile(
263
+ rf"""
264
+ (?<=
265
+ [{terminators_pattern}] # A terminator
266
+ (?:'\s|["”]|\s*[»\p{{Pf}}\p{{Pe}}]) # Closing quotes/parens
267
+ )
268
+ (?! # NOT followed by any continuation markers, punctuation, or space+lowercase
269
+ \s*\p{{Po}}|
270
+ {_build_abbr_pattern(cls.QUOTATIVE_PARTICLES | cls.REPORTING_WORDS)}|
271
+ \s+[\p{{Ll}}]
272
+ )
273
+ """,
274
+ re2.X,
275
+ )
276
+
277
+ # https://regex101.com/r/ffqwjh/2
278
+ cls.CONTIGUOUS_TERMINATORS_FINDER = re.compile(rf"(?:\s*+[{terminators_pattern}]){{2,}}") # NOTE(review): `\s*+` is a possessive quantifier; the stdlib `re` module only accepts it on Python 3.11+
279
+
280
+ def _remove_quote_and_paren_spans(
281
+ self,
282
+ main_boundaries: set[int],
283
+ text: str,
284
+ preserve_quote_and_paren: bool,
285
+ ) -> None:
286
+ """Remove boundaries inside quoted/parenthesised spans."""
287
+ if preserve_quote_and_paren:
288
+ # Ignore first pos to preserve splits before opening quote/paren,
289
+ # especially for non-whitespace languages
290
+ main_boundaries.difference_update(
291
+ pos
292
+ for m in self.QUOTE_AND_PAREN_FINDER.finditer(text)
293
+ for pos in range(m.start() + 1, m.end())
294
+ )
295
+
296
+ main_boundaries.update(
297
+ m.end() for m in self.QUOTE_AND_PAREN_END_FINDER.finditer(text)
298
+ )
299
+
300
+ def _remove_toc_spans(
301
+ self, main_boundaries: set[int], text: str
302
+ ) -> None:
303
+ """Remove boundaries inside TOC leader runs."""
304
+ if "..." in text:
305
+ for m in self.TOC_LEADER_FINDER.finditer(text):
306
+ main_boundaries.difference_update(range(*m.span()))
307
+
308
+ def _adjust_list_boundaries(self, main_boundaries: set[int], text: str) -> None:
309
+ """Remove and re-align boundaries around list markers."""
310
+ horiz_matches = list(self.HORIZONTAL_LIST_FINDER.finditer(text))
311
+ if len(horiz_matches) >= 2:
312
+ main_boundaries.difference_update(m.end() for m in horiz_matches)
313
+ # Shift boundaries the pointer back (1.\)| => |1.\), a. | => |a. ) to correctly
314
+ # terminate the preceding sentence before flattened horizontal list.
315
+ main_boundaries.update(m.start() + 1 for m in horiz_matches if m.start())
316
+
317
+ main_boundaries.difference_update(
318
+ m.end() for m in self.VERTICAL_LIST_START_FINDER.finditer(text)
319
+ )
320
+
321
def apply(
    self,
    text: str,
    preserve_quote_and_paren: bool,
) -> list[int]:
    """Detect sentence boundaries in *text*.

    Two-pass algorithm:
    1. Collect boundary candidates from punctuation positions.
    2. Remove false alarms (mid-sentence abbreviations, quote/paren
       spans, TOC leaders, list markers, runs of terminators).

    Args:
        text: A string to find sentence boundaries in.
        preserve_quote_and_paren: If ``True``, suppress boundaries
            inside quote and parenthesis spans.

    Returns:
        Sorted list of character offsets. It always contains ``0`` and
        ``len(text)``; every other entry is an offset at which one
        sentence ends and the next begins.
    """
    # Single-character substitution, so every offset still refers to the
    # caller's original text.
    text = self.NEWLINE_INSIDE_SENTENCE_FINDER.sub(" ", text)
    main_boundaries = {
        m.end() for m in self.NAIVE_BOUNDARY_FINDER.finditer(text)
    }

    # -- Remove false alarms --
    for pat in self.MID_SENTENCE_FINDER_LST:
        main_boundaries.difference_update(m.end() for m in pat.finditer(text))
    self._remove_quote_and_paren_spans(
        main_boundaries, text, preserve_quote_and_paren
    )
    self._remove_toc_spans(main_boundaries, text)
    self._adjust_list_boundaries(main_boundaries, text)

    # Within a run of terminators (e.g., Hello! !! !! ) keep only the
    # boundary after the very last one. Offsets are "cut before index"
    # positions, so the cut that follows the last terminator is m.end() and
    # everything from m.start() up to m.end() - 1 has to go. Stopping one
    # short would keep a cut just before the final terminator and split
    # e.g. "X!!" between the two marks.
    for m in self.CONTIGUOUS_TERMINATORS_FINDER.finditer(text):
        main_boundaries.difference_update(range(m.start(), m.end()))

    main_boundaries.update((0, len(text)))
    return sorted(main_boundaries)