yasbd-lib 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- yasbd/__init__.py +3 -0
- yasbd/boundary_detector.py +187 -0
- yasbd/rules/__init__.py +0 -0
- yasbd/rules/_template.py +57 -0
- yasbd/rules/base.py +366 -0
- yasbd/rules/en.py +94 -0
- yasbd/rules/es.py +51 -0
- yasbd/rules/fr.py +58 -0
- yasbd/rules/ht.py +29 -0
- yasbd/rules/ja.py +29 -0
- yasbd/utils/__init__.py +0 -0
- yasbd/utils/cleaner.py +147 -0
- yasbd/utils/cleaner_stub.py +11 -0
- yasbd/utils/input_validator.py +81 -0
- yasbd/utils/paragraph_stream.py +109 -0
- yasbd/utils/pysbd_adapter.py +168 -0
- yasbd_lib-0.1.0.dist-info/METADATA +311 -0
- yasbd_lib-0.1.0.dist-info/RECORD +21 -0
- yasbd_lib-0.1.0.dist-info/WHEEL +5 -0
- yasbd_lib-0.1.0.dist-info/licenses/LICENSE +356 -0
- yasbd_lib-0.1.0.dist-info/top_level.txt +1 -0
yasbd/__init__.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
1
|
+
from collections.abc import Generator, Iterable
|
|
2
|
+
from importlib import import_module
|
|
3
|
+
from io import TextIOBase
|
|
4
|
+
from itertools import tee
|
|
5
|
+
|
|
6
|
+
from loguru import logger
|
|
7
|
+
|
|
8
|
+
from yasbd.utils.cleaner_stub import StreamCleanerStub
|
|
9
|
+
from yasbd.utils.input_validator import validate_input
|
|
10
|
+
from yasbd.utils.paragraph_stream import ParagraphStream
|
|
11
|
+
|
|
12
|
+
# Sentinel marking the switch from one paragraph to the next while
# boundaries are being detected in relative mode.
class _ParagraphEOF:
    def __repr__(self):
        return "ParagraphEOF"


ParagraphEOF = _ParagraphEOF()
# Only the singleton instance is meant to live in the module namespace.
del _ParagraphEOF
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class BoundaryDetector:
    """Sentence boundary detector driven by per-language rule modules.

    The active rule set is resolved as ``yasbd.rules.<lang>.<Lang>Rules`` and
    is reloaded whenever :attr:`lang` is assigned a different language code.
    """

    @validate_input
    def __init__(
        self,
        lang: str = "en",
        *,
        preserve_quote_and_paren: bool = True,
        verbose: bool = False,
    ):
        """Initialize the segmenter.

        Args:
            lang: Two chars ISO language code (e.g. en, fr, ...).
            preserve_quote_and_paren: Do not split on terminators inside
                quoted or parenthesised text.
            verbose: Enable verbose logging.

        Raises:
            ValueError: If no rule set can be loaded for ``lang``.
        """
        self.preserve_quote_and_paren = preserve_quote_and_paren
        # Must be assigned before ``lang``: the property setter and
        # ``_load_rule`` both read ``self.verbose``.
        self.verbose = verbose
        self.lang = lang.lower()
        if self.verbose:  # pragma: no cover
            logger.info(
                "Initialized with lang={!r}, preserve_quote_and_paren={}, verbose={}",
                self._lang,
                self.preserve_quote_and_paren,
                self.verbose,
            )

    @property
    def lang(self) -> str:
        """ISO language code of the active rule set."""
        return self._lang

    @lang.setter
    def lang(self, lang: str) -> None:
        lang = lang.lower()
        old_lang = getattr(self, "_lang", None)
        if lang == old_lang:
            return

        # Load the rules first: if loading raises, ``_lang`` and ``_rule``
        # keep their previous values.
        self._load_rule(lang)
        self._lang = lang
        if self.verbose:  # pragma: no cover
            logger.info("Language switched from {} to {}", old_lang, self._lang)

    def _load_rule(self, lang: str) -> None:
        """Dynamically import and instantiate the rule module for *lang*.

        Args:
            lang: Lower-case language code; the module ``yasbd.rules.<lang>``
                must define a class named ``<Lang>Rules``.

        Raises:
            ValueError: If the rule module does not exist, or exists but does
                not define the expected ``<Lang>Rules`` class.
            ModuleNotFoundError: If the rule module exists but one of its own
                imports is missing.
        """
        if self.verbose:  # pragma: no cover
            logger.info("Trying to load rule module for {}", lang)

        module_name = f"yasbd.rules.{lang}"
        try:
            rule_module = import_module(module_name)
        except ModuleNotFoundError as exc:
            # Only a missing *rule* module means "unsupported language"; a
            # missing dependency of an existing rule module must stay visible.
            if exc.name and exc.name != module_name:
                raise
            raise ValueError(f"Unsupported language: {lang!r}") from None

        # Modules such as ``base`` or ``_template`` import fine but do not
        # expose a ``<Lang>Rules`` class; report them as unsupported too.
        rule_cls = getattr(rule_module, f"{lang.capitalize()}Rules", None)
        if rule_cls is None:
            raise ValueError(f"Unsupported language: {lang!r}")

        self._rule = rule_cls()

    def _detect_relative_spans(
        self,
        para_iter: Iterable[str],
    ) -> Generator[tuple[int, int], None, None]:
        """Yield per-paragraph sentence spans.

        Each span is a ``(start, end)`` pair of offsets into the paragraph it
        was computed from. Empty or whitespace-only paragraphs produce the
        single span ``(0, len(para))`` without consulting the rule set.
        """
        for para in para_iter:
            if not para or para.isspace():
                boundaries = [0, len(para)]
            else:
                boundaries = self._rule.apply(para, self.preserve_quote_and_paren)

            # Consecutive boundaries delimit one sentence each.
            yield from zip(boundaries, boundaries[1:])

    @validate_input
    def detect(
        self,
        source: str | TextIOBase | StreamCleanerStub,
        *,
        relative: bool = False,
    ) -> Generator[int, None, None]:
        """Detect sentence boundaries in the source text.

        Args:
            source: Plain text string, an open text stream (e.g., ``StringIO``),
                or a ``StreamCleaner`` instance.
            relative: If ``False`` (default), yields absolute character
                offsets from the beginning of the entire stream. If ``True``,
                offsets reset at each paragraph break, yielding indices relative
                to the start of the current paragraph.

        Note:
            When ``relative=True``, a ``ParagraphEOF`` sentinel is yielded
            between distinct paragraphs to signal the boundary of the local
            coordinate system. Import via: ``from yasbd import ParagraphEOF``.
            An empty or whitespace-only paragraph yields a sentinel of its
            own in this mode.

        Yields:
            Integer boundary offsets or ``ParagraphEOF`` sentinels.
        """

        if self.verbose:  # pragma: no cover
            logger.info(
                "Called with type={}, relative={}", type(source).__name__, relative
            )

        para_iter = (
            ParagraphStream(source) if isinstance(source, (str, TextIOBase)) else source
        )

        offset = 0
        is_first_pos = True
        for para in para_iter:
            if not para or para.isspace():
                if not relative:
                    # Blank paragraphs carry no boundary but still occupy
                    # characters in the absolute coordinate system.
                    offset += len(para)
                else:
                    yield ParagraphEOF
                continue

            if relative and not is_first_pos:
                yield ParagraphEOF
            is_first_pos = False

            # Boundaries are computed on the right-stripped paragraph, while
            # ``offset`` below advances by the full paragraph length.
            boundaries = self._rule.apply(para.rstrip(), self.preserve_quote_and_paren)

            # Skip the leading boundary (the paragraph start itself).
            for pos in boundaries[1:]:
                yield offset + pos if not relative else pos

            if not relative:
                offset += len(para)

    @validate_input
    def segment(
        self,
        source: str | TextIOBase | StreamCleanerStub,
        *,
        preserve_whitespace: bool = False,
    ) -> Generator[str, None, None]:
        """Split text into sentences.

        Args:
            source: Plain text string or ``TextIOBase`` stream (e.g., ``StringIO``, opened file).
            preserve_whitespace: If ``False`` (default), strip leading and
                trailing whitespace from each sentence.

        Yields:
            Individual sentences as strings. Empty strings are never yielded.
        """
        if self.verbose:  # pragma: no cover
            logger.info("Called with preserve_whitespace={}", preserve_whitespace)

        para_iter = (
            ParagraphStream(source, skip_empty_lines=not preserve_whitespace)
            if isinstance(source, (str, TextIOBase))
            else source
        )

        # One copy of the paragraph stream feeds span detection, the other
        # supplies the text that the spans are sliced from.
        input_for_detection, input_for_slicing = tee(para_iter)
        curr_para = None
        for start, end in self._detect_relative_spans(input_for_detection):
            # Every paragraph's first span starts at 0, so this is the cue to
            # advance to the next paragraph on the slicing side.
            if start == 0:
                curr_para = next(input_for_slicing, None)

            if curr_para is None:
                break

            sent = curr_para[start:end]
            if not preserve_whitespace:
                sent = sent.strip()
            if not sent:
                continue
            yield sent
|
yasbd/rules/__init__.py
ADDED
|
File without changes
|
yasbd/rules/_template.py
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
from yasbd.rules.base import Rules
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
# Don't remove the fmt guards as they keep compact set formatting
|
|
5
|
+
# fmt: off
|
|
6
|
+
class LangRules(Rules):
    """Starting point for a new language rule module.

    To add a language: copy this file to ``<lang>.py`` (for instance
    ``fr.py``), rename the class to ``<Lang>Rules`` (for instance
    ``FrRules``), then replace the ``...`` placeholder in the sets your
    language actually needs and delete the remaining overrides (please do
    not override all of them).
    """

    # Sentence terminators the language uses on top of the shared ones.
    TERMINATORS = {*Rules.TERMINATORS, ...}

    # Honorific and professional abbreviations that must not split a sentence.
    TITLE_ABBRVS = {*Rules.TITLE_ABBRVS, ...}

    # Dotted country and regional abbreviations (U.S., E.U., etc.).
    GEOPOLITICAL_ABBRVS = {*Rules.GEOPOLITICAL_ABBRVS, ...}

    # Citation and reference abbreviations that commonly appear mid-text.
    REFERENCE_ABBRVS = {*Rules.REFERENCE_ABBRVS, ...}

    # Street and address abbreviations (Ave., Blvd., Rd., etc.).
    STREET_ABBRVS = {*Rules.STREET_ABBRVS, ...}

    # Everyday inline abbreviations that should not end a sentence.
    MID_SENTENCE_ABBRVS = {*Rules.MID_SENTENCE_ABBRVS, ...}

    # Names or titles spelled with "!" that must not trigger a sentence break.
    NAMES_WITH_EXCLAMATION = {*Rules.NAMES_WITH_EXCLAMATION, ...}

    # Month, weekday, and calendar abbreviations.
    DATE_ABBRVS = {*Rules.DATE_ABBRVS, ...}

    # Common nouns found inside organization or institution names.
    COMMON_ORG_NOUNS = {...}

    # Frequent sentence-opening words, used as weak boundary hints.
    COMMON_SENT_STARTERS = {...}

    # -- Mostly useful for unicase or weakly-cased languages --

    # Quotative particles placed after speech, thoughts, or labels.
    QUOTATIVE_PARTICLES = {...}

    # Postpositional case markers that tightly bind an abbreviation to the
    # clause (Japanese の, Chinese 的, etc.).
    CASE_MARKERS = {...}

    # Verbs commonly used for dialogue attribution or reported speech.
    REPORTING_WORDS = {...}
|
|
56
|
+
|
|
57
|
+
# fmt: on
|
yasbd/rules/base.py
ADDED
|
@@ -0,0 +1,366 @@
|
|
|
1
|
+
import re # For simpler patterns
|
|
2
|
+
|
|
3
|
+
import regex as re2
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def _build_abbr_pattern(options: set[str]) -> str:
|
|
7
|
+
"""Build a safe escaped regex alternation pattern.
|
|
8
|
+
|
|
9
|
+
Returns a never-match pattern if no valid options exist.
|
|
10
|
+
Ref: https://stackoverflow.com/questions/1723182/a-regex-that-will-never-be-matched-by-anything?
|
|
11
|
+
"""
|
|
12
|
+
cleaned = [
|
|
13
|
+
re2.escape(opt.strip())
|
|
14
|
+
for opt in sorted(options, key=len, reverse=True)
|
|
15
|
+
if opt.strip()
|
|
16
|
+
]
|
|
17
|
+
|
|
18
|
+
return "|".join(cleaned) if cleaned else r"(?!)"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
# fmt: off
|
|
22
|
+
class Rules:
    """Language-independent rule set for sentence boundary detection.

    The class attributes are the data (terminators and abbreviation sets)
    from which the regular expressions are built. Subclasses override the
    sets they need; the patterns that depend on them are compiled once per
    class the first time an instance is created.

    NOTE(review): several literals below appear twice (e.g. "." and "!" in
    ``TERMINATORS``, every line of ``GEOPOLITICAL_ABBRVS``, ``[..]`` in
    ``dots_pattern``). Duplicates are harmless in a set or character class,
    but they may be full-width variants lost to Unicode normalisation -
    confirm against the upstream source.
    """

    TERMINATORS = {"。", ".", ".", "!", "!", "?", "?", "‼", "⁉", "⁈"}

    TITLE_ABBRVS = {
        # Standard Professional (Universal Latin roots)
        "dr", "drs", "prof", "hon", "rev", "supt", "insp",

        # Global Social (Overlap across English/Spanish/Portuguese/French)
        "mr", "mrs", "ms", "st",

        # Military (NATO/International Standardized Ranks)
        "adm", "brig", "capt", "cmdr", "col", "cpl", "gen", "lt", "maj", "sgt", "pvt",

        # Political/Administrative (Common in Western bureaucracy)
        "gov", "rep", "sen", "pres"
    }

    GEOPOLITICAL_ABBRVS = {
        # North Atlantic / Western Europe
        "u.s", "u.s.a", "u.k", "e.u",
        "u.s", "u.s.a", "u.k", "e.u",

        # Multilateral / Intergovernmental
        "u.n", "u.s.s.r",
        "u.n", "u.s.s.r",

        # Asia / Middle East
        "u.a.e", "p.r.c", "r.o.k",
        "u.a.e", "p.r.c", "r.o.k",
    }

    REFERENCE_ABBRVS = {
        # Publishing / Documents
        "et al", "app", "apps", "cf", "ext", "fig", "figs", "l", "ll",
        "n", "nn", "p", "pp", "pag", "pt", "pts", "ref", "refs", "tab",
        "tbl", "tbls", "v", "vol", "vols",

        # Section / Structure
        "ann", "art", "arts", "cap", "cl", "cls", "col", "cols", "para",
        "paras", "sec", "sect", "secs", "subsec",

        # Legal / Numbering
        "no", "nos", "reg", "regs",

        # Scientific / Math / Technical
        "approx", "eq", "eqn", "eqs", "est", "ex", "exs",

        # Academic
        "univ", "s",
    }

    DATE_ABBRVS = {
        # Months
        "jan", "feb", "mar", "apr", "jun", "jul", "sep",
        "sept", "oct", "nov", "dec", "déc",

        # Day
        "mon", "tue", "wed", "thu", "fri", "sat", "sun",
        "lun", "mar", "dom",
    }

    MID_SENTENCE_ABBRVS = {
        # Business entity bridges
        "assoc", "mfg",

        # Bridge/connectors
        "cf", "eg", "e.g", "ie", "i.e", "vs", "v", "viz", "ibid", "ca", "sc",

        # Street & directional anchors
        "mt", "dist",

        # General
        "approx", "est", "intl", "misc",
    }

    STREET_ABBRVS = {
        "ave", "blvd", "blv", "ct", "ln", "pl", "rd", "sq", "st", "wy", "way"
    }

    NAMES_WITH_EXCLAMATION = {
        # Tech, Corporate Entities, & Major Consumer Brands
        "Yahoo", "Yum", "Chips Ahoy", "Kahoot", "JOOP", "Walla",
        "I Can't Believe It's Not Butter", "Pop",

        # Gaming, Media, Animation, & Entertainment
        "Mamma Mia", "Jeopardy", "Oklahoma", "Oliver", "Shindig",
        "Hailey's On It", "Airplane", "Osu", "Ha", "VSPO",

        # Geopolitical Quirks / Municipalities
        "Westward Ho", "Saint-Louis-du-Ha", "Baie-des-Ha",

        # Public Figures, Politics, & Manufacturing Brands
        "Jeb", "Éxito", "Hey Man", "Basta", "Elliot S"
    }

    # Empty in the base rule set; language subclasses may fill these in.
    COMMON_ORG_NOUNS = set()
    COMMON_SENT_STARTERS = set()
    QUOTATIVE_PARTICLES = set()
    CASE_MARKERS = set()
    REPORTING_WORDS = set()

    # https://regex101.com/r/tI9Cmg/2
    VERTICAL_LIST_START_FINDER = re2.compile(r"(?<=^\s*(?:[\p{L}\p{N}]\.){1,3})(?=\s)")

    # https://regex101.com/r/JYdWZw/4
    QUOTE_AND_PAREN_FINDER = re2.compile(
        r"""
        (?:\p{Pi}|»|(?<=[\s:])(['""])).+?(?:\p{Pf}|«|\1)| # Quoted text
        \p{Ps}.+?\p{Pe} # Parenthesized text
        """,
        re2.X,
    )

    # https://regex101.com/r/0P9f2V/1
    TOC_LEADER_FINDER = re.compile(r"[^\W_][\s\.]{4,}\d")

    # https://regex101.com/r/ZOZlLb/2/substitution
    NEWLINE_INSIDE_SENTENCE_FINDER = re2.compile(r"(?<=[,:;)\w\s])\n(?=([a-z(]))")

    # Set to ``True`` on a class once its dynamic patterns have been compiled.
    _REGEX_CACHED = False
    # fmt: on
    def __init__(self):
        """Initialize rule instance with lazy-compiled regex patterns.

        Patterns are compiled once per class and cached via ``_REGEX_CACHED``.
        Subclasses can override data constants (abbreviation sets, terminators, etc.)
        and the classmethod ``_compile_regex_dynamically`` will pick them up.
        """
        # Look only at the class's *own* ``__dict__``: a flag inherited from
        # an already-compiled parent must not suppress compilation for a
        # subclass that overrides the data sets.
        if not type(self).__dict__.get("_REGEX_CACHED", False):
            self._compile_regex_dynamically()
            type(self)._REGEX_CACHED = True

    @classmethod
    def _compile_regex_dynamically(cls):
        """Compile language-specific regex patterns.

        Builds the patterns that depend on the class-level data sets and
        stores them as attributes on ``cls``: ``HORIZONTAL_LIST_FINDER``,
        ``NAIVE_BOUNDARY_FINDER``, ``MID_SENTENCE_FINDER_LST``,
        ``QUOTE_AND_PAREN_END_FINDER`` and ``CONTIGUOUS_TERMINATORS_FINDER``.
        """
        # The terminators are spliced into ``[...]`` character classes, so
        # each one is escaped to stay a literal even if a subclass adds
        # ``]``, ``^``, ``-`` or ``\``. Sorting makes the compiled pattern
        # independent of set iteration order.
        terminators = sorted(cls.TERMINATORS)
        terminators_pattern = "".join(re.escape(t) for t in terminators)
        # Same class without the plain dot, used for terminator clusters.
        cluster_terminators_pattern = "".join(
            re.escape(t) for t in terminators if t != "."
        )
        dots_pattern = r"[..]"
        title_abbrvs_pattern = _build_abbr_pattern(cls.TITLE_ABBRVS)
        geopolitical_abbrvs_pattern = _build_abbr_pattern(cls.GEOPOLITICAL_ABBRVS)
        common_starters_pattern = _build_abbr_pattern(cls.COMMON_SENT_STARTERS)

        # https://regex101.com/r/qBSyU5/12
        # Handle flattened lists due to messy OCR.
        cls.HORIZONTAL_LIST_FINDER = re.compile(
            rf"""
            (?: # Must preceded by
            ^\s*| # A string start
            [:{terminators_pattern}]\s+ # A terminator or double colon + space
            )
            (?:[•◦]\s+)? # Optional bullet point (e.g., • 9.)
            (?:
            [-*+]| # Markdown style list
            (?:\d{{1,2}}[.)]{{1,2}}|[a-zA-Z]\)) # Numbered and alphabetical list (e.g, a\), 34.\), 1.)
            )
            (?=\s) # Must followed by a space
            """,
            re.X,
        )

        # https://regex101.com/r/VMzYsx/9
        cls.NAIVE_BOUNDARY_FINDER = re2.compile(
            rf"""
            # Split if left token is a unicase letter (Always)
            (?<=\p{{Lo}}\s*[{terminators_pattern}])|

            # Split after any terminators followed by a a newline,
            # common sentence starter, Space+Upper or unicase letter
            (?<=[{terminators_pattern}])
            (?=
            \s*\n|
            \s+(?:[^\p{{Ll}}]|
            \s+(?<!\.\.)(?i:{common_starters_pattern})\b)|
            \s*\p{{Lo}}
            )|

            # Split at transition between Latin letters separate by alien
            (?<=[\p{{LU}}\p{{Ll}}][。!?।])(?=[\p{{Lu}}])|

            # Cluster of terminators (e.g hello!!! r u ok?)
            (?<=[{cluster_terminators_pattern}]{{2,}})(?=\s)
            """,
            re2.X,
        )

        # fmt: off
        # Faster than one big regex
        # https://regex101.com/r/svyCoU/18
        cls.MID_SENTENCE_FINDER_LST = [
            # Title abbrv or initialisms (e.g., Dr. Paul)
            re.compile(rf"\b(?i:{title_abbrvs_pattern}){dots_pattern}"),

            # Geopolitical abbrv is followed by a common org noun (e.g., U.S.A Army)
            re.compile(rf"""
                \b(?i:{geopolitical_abbrvs_pattern}){dots_pattern}
                (?=
                \s*(?:{_build_abbr_pattern(cls.CASE_MARKERS)})|
                \s+(?:{_build_abbr_pattern(cls.COMMON_ORG_NOUNS)})
                )
                """, re.X
            ),

            # Abbrv that NEVER ends a sentence
            re.compile(
                rf"\b(?i:{_build_abbr_pattern(cls.MID_SENTENCE_ABBRVS)}){dots_pattern}"
            ),

            # References abbrv followed by a number, a letter or opened paren (e.g., to p. 55, app. A)
            re2.compile(rf"""
                \b(?i:{_build_abbr_pattern(cls.REFERENCE_ABBRVS)}){dots_pattern}
                (?=\s+(?:\(|\p{{Lu}}\b|\p{{N}}|[IVXLCDM]+))
                """, re2.X
            ),

            # Date abbrv followed by a number
            re2.compile(
                rf"\b(?i:{_build_abbr_pattern(cls.DATE_ABBRVS)}){dots_pattern}(?=\s+\p{{N}})"
            ),

            # Streets/Acronyms/Exclamations words (e.g., Yahoo!, A.B. Holding, Ave. Central)
            # excluding geopolitical ones not followed by a common starters
            re2.compile(rf"""
                (?:\p{{Lu}}\.){{2,}}(?<!(?i:{geopolitical_abbrvs_pattern}))
                (?!\s+(?:{common_starters_pattern})\b)
                """, re2.X
            ),
            # NOTE(review): the ``[! !‼]`` class below contains a literal
            # space, which stays significant inside a character class even
            # in verbose mode - possibly a mangled full-width "!"; confirm.
            re.compile(rf"""
                (?:
                \b(?i:{_build_abbr_pattern(cls.STREET_ABBRVS)}){dots_pattern}|
                (?i:{_build_abbr_pattern(cls.NAMES_WITH_EXCLAMATION)})[! !‼]
                )
                (?!\s+(?:{common_starters_pattern})\b)
                """, re.X
            ),

            # Collapsed middle name (e.g, Jonas E. Smith)
            re2.compile(rf"\s\b(?:\p{{Lu}}){dots_pattern}(?=\s)"),
        ]
        # fmt: on

        # https://regex101.com/r/EGkRU8/6
        cls.QUOTE_AND_PAREN_END_FINDER = re2.compile(
            rf"""
            (?<=
            [{terminators_pattern}] # A terminator
            (?:'\s|["”]|\s*[»\p{{Pf}}\p{{Pe}}]) # Closing quotes/parens
            )
            (?! # NOT followed by any continuation markers, punctuation, or space+lowercase
            \s*\p{{Po}}|
            {_build_abbr_pattern(cls.QUOTATIVE_PARTICLES | cls.REPORTING_WORDS)}|
            \s+[\p{{Ll}}]
            )
            """,
            re2.X,
        )

        # https://regex101.com/r/ffqwjh/2
        # Compiled with ``regex``: stdlib ``re`` only accepts the possessive
        # quantifier ``*+`` from Python 3.11 onwards.
        cls.CONTIGUOUS_TERMINATORS_FINDER = re2.compile(rf"(?:\s*+[{terminators_pattern}]){{2,}}")

    def _remove_quote_and_paren_spans(
        self,
        main_boundaries: set[int],
        text: str,
        preserve_quote_and_paren: bool,
    ) -> None:
        """Remove boundaries inside quoted/parenthesised spans.

        Mutates ``main_boundaries`` in place. Boundaries found by
        ``QUOTE_AND_PAREN_END_FINDER`` (right after a terminator plus a
        closing quote/paren) are always added.

        Args:
            main_boundaries: Candidate boundary offsets, updated in place.
            text: Text the offsets refer to.
            preserve_quote_and_paren: If ``True``, drop candidates that fall
                inside a quoted or parenthesised span.
        """
        if preserve_quote_and_paren:
            # Ignore first pos to preserve splits before opening quote/paren,
            # especially for non-whitespace languages
            main_boundaries.difference_update(
                pos
                for m in self.QUOTE_AND_PAREN_FINDER.finditer(text)
                for pos in range(m.start() + 1, m.end())
            )

        main_boundaries.update(
            m.end() for m in self.QUOTE_AND_PAREN_END_FINDER.finditer(text)
        )

    def _remove_toc_spans(
        self, main_boundaries: set[int], text: str
    ) -> None:
        """Remove boundaries inside TOC leader runs.

        Mutates ``main_boundaries`` in place.
        """
        # Cheap pre-check before running the regex.
        # NOTE(review): ``TOC_LEADER_FINDER`` also accepts spaced leaders
        # (". . . ."), which this guard skips - confirm that is intended.
        if "..." in text:
            for m in self.TOC_LEADER_FINDER.finditer(text):
                main_boundaries.difference_update(range(*m.span()))

    def _adjust_list_boundaries(self, main_boundaries: set[int], text: str) -> None:
        """Remove and re-align boundaries around list markers.

        Mutates ``main_boundaries`` in place.
        """
        horiz_matches = list(self.HORIZONTAL_LIST_FINDER.finditer(text))
        # A single marker is not treated as a list; at least two are needed.
        if len(horiz_matches) >= 2:
            main_boundaries.difference_update(m.end() for m in horiz_matches)
            # Shift boundaries the pointer back (1.\)| => |1.\), a. | => |a. ) to correctly
            # terminate the preceding sentence before flattened horizontal list.
            main_boundaries.update(m.start() + 1 for m in horiz_matches if m.start())

        main_boundaries.difference_update(
            m.end() for m in self.VERTICAL_LIST_START_FINDER.finditer(text)
        )

    def apply(
        self,
        text: str,
        preserve_quote_and_paren: bool,
    ) -> list[int]:
        """Detect sentence boundaries in *text*.

        Two-pass algorithm:
          1. Collect boundary candidates from punctuation positions.
          2. Remove false alarms (mid-sentence abbreviations, ellipsis,
             quote/paren spans, list markers).

        Args:
            text: A string to find sentence boundaries in.
            preserve_quote_and_paren: If ``True``, suppress boundaries
                inside quote and parenthesis spans.

        Returns:
            Sorted list of character offsets at which sentences end. The
            list always contains ``0`` and ``len(text)``.
        """
        # Each matched newline is replaced by exactly one space, so every
        # offset computed below is still valid for the caller's text.
        text = self.NEWLINE_INSIDE_SENTENCE_FINDER.sub(" ", text)
        main_boundaries = {
            m.end() for m in self.NAIVE_BOUNDARY_FINDER.finditer(text)
        }

        # -- Remove false alarms --
        main_boundaries.difference_update(
            m.end() for pat in self.MID_SENTENCE_FINDER_LST
            for m in pat.finditer(text)
        )
        self._remove_quote_and_paren_spans(
            main_boundaries, text, preserve_quote_and_paren
        )
        self._remove_toc_spans(main_boundaries, text)
        self._adjust_list_boundaries(main_boundaries, text)

        # Remove contiguous term pos except last one (e.g., Hello! !! !! )
        main_boundaries.difference_update(
            *(
                range(m.start(), m.end() - 1)
                for m in self.CONTIGUOUS_TERMINATORS_FINDER.finditer(text)
            )
        )

        main_boundaries.update({0, len(text)})
        return sorted(main_boundaries)
|