pystylometry 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +206 -0
- pystylometry/_types.py +172 -0
- pystylometry/_utils.py +197 -0
- pystylometry/authorship/__init__.py +10 -0
- pystylometry/authorship/burrows_delta.py +152 -0
- pystylometry/authorship/zeta.py +109 -0
- pystylometry/lexical/__init__.py +17 -0
- pystylometry/lexical/hapax.py +75 -0
- pystylometry/lexical/mtld.py +61 -0
- pystylometry/lexical/yule.py +66 -0
- pystylometry/ngrams/__init__.py +13 -0
- pystylometry/ngrams/entropy.py +130 -0
- pystylometry/readability/__init__.py +15 -0
- pystylometry/readability/ari.py +70 -0
- pystylometry/readability/coleman_liau.py +67 -0
- pystylometry/readability/flesch.py +81 -0
- pystylometry/readability/gunning_fog.py +63 -0
- pystylometry/readability/smog.py +71 -0
- pystylometry/readability/syllables.py +54 -0
- pystylometry/syntactic/__init__.py +9 -0
- pystylometry/syntactic/pos_ratios.py +61 -0
- pystylometry/syntactic/sentence_stats.py +60 -0
- pystylometry/tokenizer.py +598 -0
- pystylometry-0.1.0.dist-info/METADATA +238 -0
- pystylometry-0.1.0.dist-info/RECORD +26 -0
- pystylometry-0.1.0.dist-info/WHEEL +4 -0
pystylometry/__init__.py
ADDED
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""
pystylometry - Comprehensive Python package for stylometric analysis.

A modular package for text analysis with lexical, readability, syntactic,
authorship, and n-gram metrics.

Installation:
    pip install pystylometry                  # Core (lexical only)
    pip install pystylometry[readability]     # With readability metrics
    pip install pystylometry[syntactic]       # With syntactic analysis
    pip install pystylometry[authorship]      # With authorship attribution
    pip install pystylometry[all]             # Everything

Usage:
    # Direct module imports
    from pystylometry.lexical import compute_mtld, compute_yule
    from pystylometry.readability import compute_flesch
    from pystylometry.syntactic import compute_pos_ratios
    from pystylometry.authorship import compute_burrows_delta

    # Or use the unified analyze() function
    from pystylometry import analyze

    results = analyze(text, lexical_metrics=True, readability_metrics=True)
    print(results.lexical['mtld'].mtld_average)
    print(results.readability['flesch'].reading_ease)
"""

from ._types import AnalysisResult

# Version
__version__ = "0.1.0"

# Core exports - always available
from . import lexical

# Optional exports - may raise ImportError if dependencies not installed.
# The module-level flags record which optional subpackages imported
# successfully, so analyze() and get_available_modules() can report them.
try:
    from . import readability  # noqa: F401

    _READABILITY_AVAILABLE = True
except ImportError:
    _READABILITY_AVAILABLE = False

try:
    from . import syntactic  # noqa: F401

    _SYNTACTIC_AVAILABLE = True
except ImportError:
    _SYNTACTIC_AVAILABLE = False

# Authorship and ngrams use only stdlib (no external dependencies), so
# their imports are unguarded; the flags are kept for a uniform API.
from . import (
    authorship,  # noqa: F401
    ngrams,  # noqa: F401
)

_AUTHORSHIP_AVAILABLE = True
_NGRAMS_AVAILABLE = True
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def analyze(
    text: str,
    lexical_metrics: bool = True,
    readability_metrics: bool = False,
    syntactic_metrics: bool = False,
    authorship_metrics: bool = False,
    ngram_metrics: bool = False,
) -> AnalysisResult:
    """
    Unified interface to compute multiple stylometric metrics at once.

    This is a convenience function that calls all requested metric computations
    and returns a unified result object. Only computes metrics for which the
    required optional dependencies are installed.

    Args:
        text: Input text to analyze
        lexical_metrics: Compute lexical diversity metrics (default: True)
        readability_metrics: Compute readability metrics (default: False)
        syntactic_metrics: Compute syntactic metrics (default: False)
        authorship_metrics: Compute authorship metrics (default: False)
            Note: Authorship metrics typically require multiple texts for comparison.
            This only records a pointer to the pairwise API (see body below).
        ngram_metrics: Compute n-gram entropy metrics (default: False)

    Returns:
        AnalysisResult with requested metrics in nested dictionaries

    Raises:
        ImportError: If requested analysis requires uninstalled dependencies

    Example:
        >>> from pystylometry import analyze
        >>> results = analyze(text, lexical_metrics=True, readability_metrics=True)
        >>> print(results.lexical['mtld'].mtld_average)
        >>> print(results.readability['flesch'].reading_ease)

    Example with all metrics:
        >>> results = analyze(text, lexical_metrics=True, readability_metrics=True,
        ...                   syntactic_metrics=True, ngram_metrics=True)
        >>> print(f"MTLD: {results.lexical['mtld'].mtld_average:.2f}")
        >>> print(f"Flesch: {results.readability['flesch'].reading_ease:.1f}")
        >>> print(f"Noun ratio: {results.syntactic['pos'].noun_ratio:.3f}")
        >>> print(f"Bigram entropy: {results.ngrams['word_bigram'].entropy:.3f}")
    """
    result = AnalysisResult(metadata={"text_length": len(text)})

    # Lexical metrics (always available)
    if lexical_metrics:
        result.lexical = {}
        # TODO: Add when stylometry-ttr is integrated
        # result.lexical['ttr'] = lexical.compute_ttr(text)
        result.lexical["mtld"] = lexical.compute_mtld(text)
        result.lexical["yule"] = lexical.compute_yule(text)
        result.lexical["hapax"] = lexical.compute_hapax_ratios(text)

    # Readability metrics (optional dependency)
    if readability_metrics:
        if not _READABILITY_AVAILABLE:
            raise ImportError(
                "Readability metrics require optional dependencies. "
                "Install with: pip install pystylometry[readability]"
            )
        # Import locally to avoid name conflict
        from . import readability as readability_module

        result.readability = {}
        result.readability["flesch"] = readability_module.compute_flesch(text)
        result.readability["smog"] = readability_module.compute_smog(text)
        result.readability["gunning_fog"] = readability_module.compute_gunning_fog(text)
        result.readability["coleman_liau"] = readability_module.compute_coleman_liau(text)
        result.readability["ari"] = readability_module.compute_ari(text)

    # Syntactic metrics (optional dependency)
    if syntactic_metrics:
        if not _SYNTACTIC_AVAILABLE:
            raise ImportError(
                "Syntactic metrics require optional dependencies. "
                "Install with: pip install pystylometry[syntactic]"
            )
        # Import locally to avoid name conflict
        from . import syntactic as syntactic_module

        result.syntactic = {}
        result.syntactic["pos"] = syntactic_module.compute_pos_ratios(text)
        result.syntactic["sentence_stats"] = syntactic_module.compute_sentence_stats(text)

    # Authorship metrics (uses stdlib only)
    # Note: These are typically used for comparison between texts
    # Here we just note that they're available but don't compute them
    # since they require multiple texts as input
    if authorship_metrics:
        result.authorship = {
            "note": "Authorship metrics require multiple texts for comparison. "
            "Use pystylometry.authorship.compute_burrows_delta(text1, text2) directly."
        }

    # N-gram metrics (uses stdlib only)
    if ngram_metrics:
        result.ngrams = {}
        result.ngrams["character_bigram"] = ngrams.compute_character_bigram_entropy(text)
        result.ngrams["word_bigram"] = ngrams.compute_word_bigram_entropy(text)

    return result
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
# Convenient access to availability flags
|
|
169
|
+
def get_available_modules() -> dict[str, bool]:
    """
    Report which pystylometry metric modules are importable.

    Returns:
        Dictionary mapping module names to availability status.

    Example:
        >>> from pystylometry import get_available_modules
        >>> available = get_available_modules()
        >>> if available['readability']:
        ...     from pystylometry.readability import compute_flesch
    """
    return dict(
        lexical=True,  # core module, always installed
        readability=_READABILITY_AVAILABLE,
        syntactic=_SYNTACTIC_AVAILABLE,
        authorship=_AUTHORSHIP_AVAILABLE,
        ngrams=_NGRAMS_AVAILABLE,
    )
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Public API: core names are always exported; optional subpackages are
# appended below only if their imports succeeded at package load time.
__all__ = [
    "__version__",
    "analyze",
    "get_available_modules",
    "lexical",
]

# Conditionally add to __all__ based on availability
if _READABILITY_AVAILABLE:
    __all__.append("readability")
if _SYNTACTIC_AVAILABLE:
    __all__.append("syntactic")
if _AUTHORSHIP_AVAILABLE:
    __all__.append("authorship")
if _NGRAMS_AVAILABLE:
    __all__.append("ngrams")
|
pystylometry/_types.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Result dataclasses for all pystylometry metrics."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# ===== Lexical Results =====


@dataclass
class MTLDResult:
    """Result from MTLD (Measure of Textual Lexical Diversity) computation."""

    mtld_forward: float   # MTLD over the token stream in reading order
    mtld_backward: float  # MTLD over the reversed token stream
    mtld_average: float   # combined forward/backward score — presumably their mean; confirm in mtld.py
    metadata: dict[str, Any]  # extra computation details (populated by compute_mtld)


@dataclass
class YuleResult:
    """Result from Yule's K and I computation."""

    yule_k: float  # Yule's characteristic K (vocabulary repetitiveness)
    yule_i: float  # Yule's I — presumably the inverse-style companion of K; confirm in yule.py
    metadata: dict[str, Any]


@dataclass
class HapaxResult:
    """Result from Hapax Legomena analysis."""

    hapax_count: int      # number of words occurring exactly once
    hapax_ratio: float    # hapax proportion — denominator (tokens vs. vocab) set in hapax.py
    dis_hapax_count: int  # number of words occurring exactly twice (dis legomena)
    dis_hapax_ratio: float
    sichel_s: float  # Sichel's S statistic
    honore_r: float  # Honoré's R statistic
    metadata: dict[str, Any]
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
# ===== Readability Results =====


@dataclass
class FleschResult:
    """Result from Flesch Reading Ease and Flesch-Kincaid Grade computation."""

    reading_ease: float  # Flesch Reading Ease score (conventionally, higher = easier)
    grade_level: float   # Flesch-Kincaid grade level
    difficulty: str  # "Very Easy", "Easy", "Fairly Easy", "Standard", etc.
    metadata: dict[str, Any]  # computation details (populated by compute_flesch)


@dataclass
class SMOGResult:
    """Result from SMOG Index computation."""

    smog_index: float  # raw SMOG index value
    grade_level: int   # index mapped to a school grade — see smog.py for the mapping
    metadata: dict[str, Any]


@dataclass
class GunningFogResult:
    """Result from Gunning Fog Index computation."""

    fog_index: float  # raw Gunning Fog index value
    grade_level: int  # index mapped to a school grade — see gunning_fog.py for the mapping
    metadata: dict[str, Any]


@dataclass
class ColemanLiauResult:
    """Result from Coleman-Liau Index computation."""

    cli_index: float  # raw Coleman-Liau index value
    grade_level: int  # index mapped to a school grade — see coleman_liau.py for the mapping
    metadata: dict[str, Any]


@dataclass
class ARIResult:
    """Result from Automated Readability Index computation."""

    ari_score: float  # raw ARI value
    grade_level: int  # ARI mapped to a school grade
    age_range: str    # human-readable reader age range — exact format defined in ari.py
    metadata: dict[str, Any]
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ===== Syntactic Results =====


@dataclass
class POSResult:
    """Result from Part-of-Speech ratio analysis."""

    noun_ratio: float       # fraction of tokens tagged as nouns
    verb_ratio: float       # fraction of tokens tagged as verbs
    adjective_ratio: float  # fraction of tokens tagged as adjectives
    adverb_ratio: float     # fraction of tokens tagged as adverbs
    noun_verb_ratio: float       # nouns per verb
    adjective_noun_ratio: float  # adjectives per noun
    lexical_density: float       # content-word proportion — confirm definition in pos_ratios.py
    function_word_ratio: float   # function-word proportion
    metadata: dict[str, Any]


@dataclass
class SentenceStatsResult:
    """Result from sentence-level statistics."""

    mean_sentence_length: float  # average sentence length — unit (tokens vs. words) set in sentence_stats.py
    sentence_length_std: float   # standard deviation of sentence lengths
    sentence_length_range: int   # max_sentence_length - min_sentence_length — confirm in sentence_stats.py
    min_sentence_length: int
    max_sentence_length: int
    sentence_count: int          # number of sentences detected
    metadata: dict[str, Any]
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
# ===== Authorship Results =====


@dataclass
class BurrowsDeltaResult:
    """Result from Burrows' Delta computation."""

    delta_score: float  # stylistic distance between the compared texts (lower typically = more similar)
    distance_type: str  # "burrows", "cosine", "eder"
    mfw_count: int      # number of most-frequent words used in the comparison
    metadata: dict[str, Any]


@dataclass
class ZetaResult:
    """Result from Zeta score computation."""

    zeta_score: float
    marker_words: list[str]       # words characteristic of one side of the comparison — confirm in zeta.py
    anti_marker_words: list[str]  # words characteristic of the other side — confirm in zeta.py
    metadata: dict[str, Any]
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
# ===== N-gram Results =====


@dataclass
class EntropyResult:
    """Result from n-gram entropy computation."""

    entropy: float     # Shannon entropy of the n-gram distribution — base/units set in entropy.py
    perplexity: float  # perplexity derived from the entropy
    ngram_type: str  # "character_bigram", "word_bigram", "word_trigram"
    metadata: dict[str, Any]
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
# ===== Unified Analysis Result =====


@dataclass
class AnalysisResult:
    """
    Unified result from comprehensive stylometric analysis.

    Each section is None when the corresponding metric family was not
    requested via analyze(); otherwise it is a dict keyed by metric name
    (e.g. "mtld", "flesch", "pos") holding that metric's result object.
    """

    lexical: dict[str, Any] | None = None      # e.g. {"mtld": ..., "yule": ..., "hapax": ...}
    readability: dict[str, Any] | None = None  # e.g. {"flesch": ..., "smog": ..., "ari": ...}
    syntactic: dict[str, Any] | None = None    # e.g. {"pos": ..., "sentence_stats": ...}
    authorship: dict[str, Any] | None = None   # analyze() stores only a usage note here
    ngrams: dict[str, Any] | None = None       # e.g. {"character_bigram": ..., "word_bigram": ...}
    metadata: dict[str, Any] | None = None     # analyze() sets {"text_length": len(text)}
|
pystylometry/_utils.py
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Shared utility functions for pystylometry."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .tokenizer import Tokenizer
|
|
8
|
+
|
|
9
|
+
# ===== Convenience Functions =====

# Default tokenizer instance for backward compatibility: preserves case and
# keeps punctuation tokens. Shared by tokenize() below.
_default_tokenizer = Tokenizer(
    lowercase=False,
    strip_punctuation=False,
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def tokenize(text: str) -> list[str]:
    """
    Tokenize *text* using the package's default settings.

    Thin backward-compatible wrapper over the shared module-level
    ``Tokenizer`` (case preserved, punctuation kept as tokens),
    matching the original simple tokenizer interface.

    Args:
        text: Input text to tokenize

    Returns:
        List of tokens

    Example:
        >>> tokens = tokenize("Hello, world!")
        >>> print(tokens)
        ['Hello', ',', 'world', '!']
    """
    return _default_tokenizer.tokenize(text)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def advanced_tokenize(
    text: str,
    lowercase: bool = True,
    strip_punctuation: bool = True,
    expand_contractions: bool = False,
) -> list[str]:
    """
    Tokenize *text* with commonly-used advanced options.

    A fresh ``Tokenizer`` is configured per call from the keyword
    arguments, so calls with different options never interfere.

    Args:
        text: Input text to tokenize
        lowercase: Convert to lowercase (default: True)
        strip_punctuation: Remove punctuation tokens (default: True)
        expand_contractions: Expand contractions (default: False)

    Returns:
        List of tokens

    Example:
        >>> tokens = advanced_tokenize("Hello, world! It's nice.", lowercase=True)
        >>> print(tokens)
        ['hello', 'world', "it's", 'nice']
    """
    options = {
        "lowercase": lowercase,
        "strip_punctuation": strip_punctuation,
        "expand_contractions": expand_contractions,
    }
    return Tokenizer(**options).tokenize(text)
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
# ===== Sentence Splitting =====

# Common abbreviations that shouldn't trigger sentence boundaries
_ABBREVIATIONS = {
    "mr.",
    "mrs.",
    "ms.",
    "dr.",
    "prof.",
    "sr.",
    "jr.",
    "st.",
    "vs.",
    "etc.",
    "e.g.",
    "i.e.",
    "al.",
    "fig.",
    "vol.",
    "no.",
    "inc.",
    "corp.",
    "ltd.",
    "co.",
    "ph.d.",
    "m.d.",
    "b.a.",
    "m.a.",
    "j.d.",
    "rev.",
    "gen.",
    "rep.",
    "sen.",
    "capt.",
}

# Pre-compiled, case-insensitive patterns, one per abbreviation, built once
# at import time instead of on every split_sentences() call. The negative
# look-behind stops matches inside a longer word (e.g. the "no." at the end
# of "casino.").
_ABBREV_PATTERNS = [
    re.compile(r"(?<![A-Za-z0-9])" + re.escape(abbr), re.IGNORECASE)
    for abbr in _ABBREVIATIONS
]

# Sentinel standing in for an abbreviation's periods during splitting.
# NUL is vanishingly unlikely to appear in real input text.
_DOT_SENTINEL = "\x00"


def split_sentences(text: str) -> list[str]:
    """
    Split text into sentences with improved boundary detection.

    Handles common abbreviations and edge cases better than simple
    splitting on sentence-ending punctuation. Three-pass approach:
    1. Replace the periods of known abbreviations with a sentinel so they
       cannot be mistaken for sentence boundaries. Every occurrence is
       protected (not just the first), and each keeps its own casing.
    2. Split on terminal punctuation + whitespace + capital letter.
    3. Restore the protected periods.

    Args:
        text: Input text to split

    Returns:
        List of sentences

    Example:
        >>> sentences = split_sentences("Dr. Smith arrived. He was happy.")
        >>> print(sentences)
        ['Dr. Smith arrived.', 'He was happy.']
    """
    if not text:
        return []

    # Pass 1: hide every abbreviation period behind the sentinel.
    # (The original code protected only the first occurrence of each
    # abbreviation, so a second "Dr." in the same text caused a bad split.)
    protected = text
    for pattern in _ABBREV_PATTERNS:
        protected = pattern.sub(
            lambda m: m.group(0).replace(".", _DOT_SENTINEL), protected
        )

    # Pass 2: split on sentence boundaries: punctuation run + whitespace +
    # capital letter. With one capturing group, re.split alternates
    # [text, punctuation, text, punctuation, ..., text].
    pieces = re.split(r"([.!?]+)\s+(?=[A-Z])", protected)

    result = []
    for i in range(0, len(pieces), 2):
        sentence = pieces[i]
        if i + 1 < len(pieces):
            # Re-attach whatever punctuation run was captured ("!!", "?!",
            # etc. — not just a hard-coded list of combinations).
            sentence += pieces[i + 1]
        # Pass 3: restore the abbreviation periods.
        sentence = sentence.replace(_DOT_SENTINEL, ".").strip()
        if sentence:
            result.append(sentence)

    # Fallback: the capital-letter heuristic found nothing (e.g. sentences
    # starting in lowercase) — split on punctuation alone, still honouring
    # the protected abbreviations.
    if len(result) <= 1 and text:
        parts = re.split(r"[.!?]+\s+", protected)
        result = [
            p.replace(_DOT_SENTINEL, ".").strip() for p in parts if p.strip()
        ]

    return result
|
|
174
|
+
|
|
175
|
+
|
|
176
|
+
def check_optional_dependency(module_name: str, extra_name: str) -> bool:
    """
    Check if an optional dependency is installed.

    Args:
        module_name: Importable name of the module to check
        extra_name: Name of the extra in pyproject.toml

    Returns:
        True if module is available

    Raises:
        ImportError: If the module is not installed; the message includes
            the exact pip command for the matching extra, and the original
            import failure is chained as ``__cause__`` for easier debugging.
    """
    try:
        __import__(module_name)
    except ImportError as exc:
        # Chain the original failure instead of discarding it, so tracebacks
        # show *why* the import failed (missing transitive dep, broken
        # install, ...), not just that it did.
        raise ImportError(
            f"The '{module_name}' package is required for this functionality. "
            f"Install it with: pip install pystylometry[{extra_name}]"
        ) from exc
    return True
|