pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the contents of publicly released versions of this package as published to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/__init__.py
CHANGED
@@ -2,7 +2,7 @@
 pystylometry - Comprehensive Python package for stylometric analysis.
 
 A modular package for text analysis with lexical, readability, syntactic,
-authorship,
+authorship, n-gram, dialect detection, and consistency analysis metrics.
 
 Installation:
     pip install pystylometry  # Core (lexical only)
@@ -16,7 +16,9 @@ Usage:
     from pystylometry.lexical import compute_mtld, compute_yule
     from pystylometry.readability import compute_flesch
     from pystylometry.syntactic import compute_pos_ratios
-    from pystylometry.authorship import compute_burrows_delta
+    from pystylometry.authorship import compute_burrows_delta, compute_kilgarriff
+    from pystylometry.consistency import compute_kilgarriff_drift
+    from pystylometry.dialect import compute_dialect
 
     # Or use the unified analyze() function
     from pystylometry import analyze
@@ -24,6 +26,18 @@ Usage:
     results = analyze(text, lexical=True, readability=True)
     print(results.lexical['mtld'].mtld_average)
     print(results.readability['flesch'].reading_ease)
+
+    # Dialect detection
+    result = compute_dialect("The colour of the programme was brilliant.")
+    print(result.dialect)  # 'british'
+    print(result.british_score)  # 0.85
+
+    # Consistency analysis (Style Drift Detector - Issue #36)
+    from pystylometry.consistency import compute_kilgarriff_drift
+
+    result = compute_kilgarriff_drift(long_document)
+    print(result.pattern)  # 'consistent', 'sudden_spike', 'suspiciously_uniform', etc.
+    print(result.pattern_confidence)
 """
 
 from ._types import AnalysisResult
@@ -49,14 +63,18 @@ try:
 except ImportError:
     _SYNTACTIC_AVAILABLE = False
 
-# Authorship and
+# Authorship, ngrams, dialect, and consistency use only stdlib (no external dependencies)
 from . import (
     authorship,  # noqa: F401
+    consistency,  # noqa: F401 - Style drift detection (Issue #36)
+    dialect,  # noqa: F401
     ngrams,  # noqa: F401
 )
 
 _AUTHORSHIP_AVAILABLE = True
 _NGRAMS_AVAILABLE = True
+_DIALECT_AVAILABLE = True
+_CONSISTENCY_AVAILABLE = True
 
 
 def analyze(
@@ -109,8 +127,7 @@ def analyze(
     # Lexical metrics (always available)
     if lexical_metrics:
         result.lexical = {}
-
-        # result.lexical['ttr'] = lexical.compute_ttr(text)
+        result.lexical["ttr"] = lexical.compute_ttr(text)
         result.lexical["mtld"] = lexical.compute_mtld(text)
         result.lexical["yule"] = lexical.compute_yule(text)
         result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
@@ -178,6 +195,8 @@ def get_available_modules() -> dict[str, bool]:
         >>> available = get_available_modules()
         >>> if available['readability']:
         ...     from pystylometry.readability import compute_flesch
+        >>> if available['consistency']:
+        ...     from pystylometry.consistency import compute_kilgarriff_drift
     """
     return {
         "lexical": True,  # Always available
@@ -185,6 +204,8 @@ def get_available_modules() -> dict[str, bool]:
         "syntactic": _SYNTACTIC_AVAILABLE,
         "authorship": _AUTHORSHIP_AVAILABLE,
         "ngrams": _NGRAMS_AVAILABLE,
+        "dialect": _DIALECT_AVAILABLE,
+        "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
    }
 
 
@@ -204,3 +225,7 @@ if _AUTHORSHIP_AVAILABLE:
     __all__.append("authorship")
 if _NGRAMS_AVAILABLE:
     __all__.append("ngrams")
+if _DIALECT_AVAILABLE:
+    __all__.append("dialect")
+if _CONSISTENCY_AVAILABLE:
+    __all__.append("consistency")
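Taken together, the __init__.py changes do three things: TTR is now actually computed in analyze() (the call was previously commented out), and the new dialect and consistency modules are imported, flagged, and exported. Below is a minimal sketch of how the updated surface could be exercised; the attribute names (dialect, british_score, pattern, pattern_confidence, mtld_average) are taken from the docstring above, and anything beyond them is an assumption rather than confirmed API.

# Sketch only: exercises the API surface shown in the __init__.py diff above.
# Attribute names come from the docstring; everything else is an assumption.
from pystylometry import analyze, get_available_modules

text = "The colour of the programme was brilliant."

available = get_available_modules()          # now also reports 'dialect' and 'consistency'
results = analyze(text, lexical=True, readability=True)
print(results.lexical["ttr"])                # TTR result, newly wired into analyze()
print(results.lexical["mtld"].mtld_average)

if available["dialect"]:
    from pystylometry.dialect import compute_dialect
    d = compute_dialect(text)
    print(d.dialect, d.british_score)        # e.g. 'british', 0.85 per the docstring

if available["consistency"]:
    from pystylometry.consistency import compute_kilgarriff_drift
    drift = compute_kilgarriff_drift(text * 200)   # drift detection expects a long document
    print(drift.pattern, drift.pattern_confidence)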
pystylometry/_normalize.py
ADDED
@@ -0,0 +1,277 @@
+"""Token normalization for stylometric analysis.
+
+This module provides token filtering and normalization utilities for different
+analysis scenarios. The primary use case is filtering out non-words (numbers,
+URLs, emails, etc.) before passing tokens to readability metrics that rely on
+syllable counting.
+
+Design Philosophy:
+-----------------
+Different stylometric analyses require different normalization strategies:
+
+1. **Readability Metrics** (Flesch, SMOG, etc.):
+   - Strict filtering: only alphabetic words
+   - Removes numbers, URLs, emails, punctuation
+   - Prevents garbage/crashes in syllable counting
+
+2. **Authorship Attribution**:
+   - Preserve stylistic markers
+   - Keep contractions, hyphens, apostrophes
+   - More permissive filtering
+
+3. **Lexical Diversity**:
+   - Balance between cleanliness and vocabulary richness
+   - May keep some punctuation patterns
+   - Configurable based on research question
+
+Critical Issue Addressed:
+------------------------
+Without normalization, readability metrics receive non-words from the tokenizer:
+- count_syllables("2026") → undefined behavior (crash or garbage)
+- count_syllables("test@example.com") → undefined behavior
+- count_syllables("C++") → undefined behavior
+- count_syllables("$99.99") → undefined behavior
+
+This module ensures only syllabifiable words reach syllable counting functions.
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def is_word_token(token: str) -> bool:
+    """
+    Check if a token is a valid word for readability analysis.
+
+    A valid word token is:
+    - Purely alphabetic (including accented characters)
+    - May contain internal apostrophes (contractions like "don't")
+    - May contain internal hyphens (compound words like "co-operate")
+    - Does NOT start or end with punctuation
+
+    Args:
+        token: Token to validate
+
+    Returns:
+        True if token is a valid word
+
+    Examples:
+        >>> is_word_token("hello")
+        True
+        >>> is_word_token("don't")
+        True
+        >>> is_word_token("co-operate")
+        True
+        >>> is_word_token("123")
+        False
+        >>> is_word_token("test@example.com")
+        False
+        >>> is_word_token("...")
+        False
+    """
+    if not token or len(token) == 0:
+        return False
+
+    # Must start and end with alphabetic character
+    if not (token[0].isalpha() and token[-1].isalpha()):
+        return False
+
+    # Check middle characters - allow letters, apostrophes, hyphens
+    for char in token:
+        if not (char.isalpha() or char in ("'", "-")):
+            return False
+
+    return True
+
+
+def normalize_for_readability(tokens: list[str]) -> list[str]:
+    """
+    Normalize tokens for readability metrics (e.g., Flesch, SMOG).
+
+    Filters tokens to only include valid words that can have syllables counted.
+    This prevents errors and garbage results from non-word tokens.
+
+    Filtering rules:
+    - Keep only alphabetic words (a-zA-Z)
+    - Keep contractions with apostrophes ("don't", "we're")
+    - Keep hyphenated compound words ("co-operate", "re-enter")
+    - Remove pure numbers ("2026", "3.14")
+    - Remove URLs ("http://example.com")
+    - Remove emails ("test@example.com")
+    - Remove special characters ("C++", "O'Brian" → keep, "$99.99" → remove)
+    - Remove pure punctuation ("...", "—", "!!!")
+
+    Args:
+        tokens: List of tokens from tokenizer
+
+    Returns:
+        Filtered list containing only valid word tokens
+
+    Examples:
+        >>> tokens = ["The", "year", "2026", "had", "365", "days"]
+        >>> normalize_for_readability(tokens)
+        ['The', 'year', 'had', 'days']
+
+        >>> tokens = ["Dr", "Smith", "works", "at", "U", ".", "S", ".", "Steel"]
+        >>> normalize_for_readability(tokens)
+        ['Dr', 'Smith', 'works', 'at', 'U', 'S', 'Steel']
+
+        >>> tokens = ["Email", "test@example.com", "for", "help"]
+        >>> normalize_for_readability(tokens)
+        ['Email', 'for', 'help']
+    """
+    return [token for token in tokens if is_word_token(token)]
+
+
+def normalize_for_stylometry(
+    tokens: list[str],
+    preserve_contractions: bool = True,
+    preserve_hyphens: bool = True,
+    min_length: int = 1,
+) -> list[str]:
+    """
+    Normalize tokens for stylometric analysis (authorship attribution, etc.).
+
+    More permissive than readability normalization. Preserves stylistic markers
+    that may be relevant for authorship analysis.
+
+    Args:
+        tokens: List of tokens from tokenizer
+        preserve_contractions: Keep contracted forms (default: True)
+        preserve_hyphens: Keep hyphenated words (default: True)
+        min_length: Minimum token length (default: 1)
+
+    Returns:
+        Filtered list of tokens suitable for stylometric analysis
+
+    Examples:
+        >>> tokens = ["don't", "re-enter", "test@example.com", "..."]
+        >>> normalize_for_stylometry(tokens)
+        ["don't", "re-enter"]
+
+        >>> normalize_for_stylometry(tokens, preserve_contractions=False)
+        ['re-enter']
+    """
+    result = []
+    for token in tokens:
+        # Check minimum length
+        if len(token) < min_length:
+            continue
+
+        # Skip URLs and emails (not stylistically relevant)
+        if "@" in token or token.startswith(("http://", "https://", "www.")):
+            continue
+
+        # Must contain at least one alphabetic character
+        if not any(c.isalpha() for c in token):
+            continue
+
+        # Handle contractions and hyphenated words (including tokens with both)
+        has_apostrophe = "'" in token
+        has_hyphen = "-" in token
+
+        if has_apostrophe or has_hyphen:
+            # Only consider valid word tokens
+            if not is_word_token(token):
+                continue
+
+            # Respect configuration flags for each stylistic feature present
+            if (has_apostrophe and not preserve_contractions) or (
+                has_hyphen and not preserve_hyphens
+            ):
+                continue
+
+            result.append(token)
+            continue
+
+        # Default: keep if it's a valid word
+        if is_word_token(token):
+            result.append(token)
+
+    return result
+
+
+def clean_for_syllable_counting(text: str) -> str:
+    """
+    Pre-clean text before tokenization for syllable-based readability metrics.
+
+    This is a defensive normalization layer that removes known problematic
+    patterns BEFORE tokenization, reducing the burden on token filtering.
+
+    Transformations:
+    - Remove URLs
+    - Remove email addresses
+    - Remove currency symbols with numbers ($99, £50, €100)
+    - Remove standalone numbers
+    - Normalize multiple spaces
+
+    Note: This is complementary to token-level filtering, not a replacement.
+    Both layers provide defense-in-depth against garbage syllable counts.
+
+    Args:
+        text: Raw input text
+
+    Returns:
+        Cleaned text ready for tokenization
+
+    Examples:
+        >>> clean_for_syllable_counting("Visit http://example.com today!")
+        'Visit today!'
+
+        >>> clean_for_syllable_counting("Email test@example.com for help")
+        'Email for help'
+
+        >>> clean_for_syllable_counting("The price is $99.99 on sale")
+        'The price is on sale'
+    """
+    # Remove URLs (http, https, www)
+    text = re.sub(r"https?://\S+", "", text)
+    text = re.sub(r"www\.\S+", "", text)
+
+    # Remove email addresses
+    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "", text)
+
+    # Remove currency patterns ($99, £50, €100, $50,000, etc.)
+    text = re.sub(r"[$£€¥]\d+(?:[,.]\d+)*", "", text)
+
+    # Remove standalone numbers (with optional decimals, commas)
+    text = re.sub(r"\b\d+(?:[,.]\d+)*\b", "", text)
+
+    # Normalize whitespace
+    text = re.sub(r"\s+", " ", text)
+
+    return text.strip()
+
+
+def validate_tokens_for_readability(tokens: list[str]) -> tuple[list[str], list[str]]:
+    """
+    Validate tokens for readability analysis and report problematic tokens.
+
+    This is a diagnostic function useful for debugging tokenization issues.
+    It separates valid word tokens from problematic non-words.
+
+    Args:
+        tokens: List of tokens to validate
+
+    Returns:
+        Tuple of (valid_tokens, invalid_tokens)
+
+    Examples:
+        >>> tokens = ["Hello", "2026", "test@example.com", "world"]
+        >>> valid, invalid = validate_tokens_for_readability(tokens)
+        >>> print(valid)
+        ['Hello', 'world']
+        >>> print(invalid)
+        ['2026', 'test@example.com']
+    """
+    valid = []
+    invalid = []
+
+    for token in tokens:
+        if is_word_token(token):
+            valid.append(token)
+        else:
+            invalid.append(token)
+
+    return valid, invalid
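The new module is designed as a two-layer defense for the syllable-based readability metrics: pre-clean the raw text, then filter the resulting token stream. Below is a minimal sketch of that pipeline, assuming a naive whitespace split as a stand-in tokenizer (the package's real tokenizer is not shown in this diff); note also that _normalize is an internal module, so importing it directly may not be part of the supported public API.

# Sketch of the two-layer normalization described in _normalize.py above.
# The whitespace split is a placeholder tokenizer, not the package's real one.
from pystylometry._normalize import (
    clean_for_syllable_counting,
    normalize_for_readability,
    validate_tokens_for_readability,
)

raw = "Visit http://example.com today ... the price is $99.99 and our email is test@example.com"

cleaned = clean_for_syllable_counting(raw)    # layer 1: strip URLs, emails, currency, numbers
tokens = cleaned.split()                      # placeholder tokenization
words = normalize_for_readability(tokens)     # layer 2: keep only syllabifiable word tokens

valid, invalid = validate_tokens_for_readability(tokens)
print(words)                                  # alphabetic words that are safe to syllabify
print(invalid)                                # diagnostic: tokens the word filter rejected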