pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/__init__.py
CHANGED
@@ -109,8 +109,7 @@ def analyze(
     # Lexical metrics (always available)
     if lexical_metrics:
         result.lexical = {}
-
-        # result.lexical['ttr'] = lexical.compute_ttr(text)
+        result.lexical["ttr"] = lexical.compute_ttr(text)
         result.lexical["mtld"] = lexical.compute_mtld(text)
         result.lexical["yule"] = lexical.compute_yule(text)
         result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
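The only change to __init__.py is that the previously commented-out TTR call is now active, so analyze() populates result.lexical["ttr"] in 1.0.0. A minimal usage sketch, assuming analyze() takes the text as its first argument and that lexical metrics are enabled by default (neither the full signature nor the defaults are visible in this hunk):

    from pystylometry import analyze

    result = analyze("The quick brown fox jumps over the lazy dog.")
    print(result.lexical["ttr"])   # populated in 1.0.0; commented out in 0.1.0
    print(result.lexical["mtld"])  # unchanged from 0.1.0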
pystylometry/_normalize.py
ADDED
@@ -0,0 +1,277 @@
+"""Token normalization for stylometric analysis.
+
+This module provides token filtering and normalization utilities for different
+analysis scenarios. The primary use case is filtering out non-words (numbers,
+URLs, emails, etc.) before passing tokens to readability metrics that rely on
+syllable counting.
+
+Design Philosophy:
+-----------------
+Different stylometric analyses require different normalization strategies:
+
+1. **Readability Metrics** (Flesch, SMOG, etc.):
+   - Strict filtering: only alphabetic words
+   - Removes numbers, URLs, emails, punctuation
+   - Prevents garbage/crashes in syllable counting
+
+2. **Authorship Attribution**:
+   - Preserve stylistic markers
+   - Keep contractions, hyphens, apostrophes
+   - More permissive filtering
+
+3. **Lexical Diversity**:
+   - Balance between cleanliness and vocabulary richness
+   - May keep some punctuation patterns
+   - Configurable based on research question
+
+Critical Issue Addressed:
+------------------------
+Without normalization, readability metrics receive non-words from the tokenizer:
+- count_syllables("2026") → undefined behavior (crash or garbage)
+- count_syllables("test@example.com") → undefined behavior
+- count_syllables("C++") → undefined behavior
+- count_syllables("$99.99") → undefined behavior
+
+This module ensures only syllabifiable words reach syllable counting functions.
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def is_word_token(token: str) -> bool:
+    """
+    Check if a token is a valid word for readability analysis.
+
+    A valid word token is:
+    - Purely alphabetic (including accented characters)
+    - May contain internal apostrophes (contractions like "don't")
+    - May contain internal hyphens (compound words like "co-operate")
+    - Does NOT start or end with punctuation
+
+    Args:
+        token: Token to validate
+
+    Returns:
+        True if token is a valid word
+
+    Examples:
+        >>> is_word_token("hello")
+        True
+        >>> is_word_token("don't")
+        True
+        >>> is_word_token("co-operate")
+        True
+        >>> is_word_token("123")
+        False
+        >>> is_word_token("test@example.com")
+        False
+        >>> is_word_token("...")
+        False
+    """
+    if not token or len(token) == 0:
+        return False
+
+    # Must start and end with alphabetic character
+    if not (token[0].isalpha() and token[-1].isalpha()):
+        return False
+
+    # Check middle characters - allow letters, apostrophes, hyphens
+    for char in token:
+        if not (char.isalpha() or char in ("'", "-")):
+            return False
+
+    return True
+
+
+def normalize_for_readability(tokens: list[str]) -> list[str]:
+    """
+    Normalize tokens for readability metrics (e.g., Flesch, SMOG).
+
+    Filters tokens to only include valid words that can have syllables counted.
+    This prevents errors and garbage results from non-word tokens.
+
+    Filtering rules:
+    - Keep only alphabetic words (a-zA-Z)
+    - Keep contractions with apostrophes ("don't", "we're")
+    - Keep hyphenated compound words ("co-operate", "re-enter")
+    - Remove pure numbers ("2026", "3.14")
+    - Remove URLs ("http://example.com")
+    - Remove emails ("test@example.com")
+    - Remove special characters ("C++", "O'Brian" → keep, "$99.99" → remove)
+    - Remove pure punctuation ("...", "—", "!!!")
+
+    Args:
+        tokens: List of tokens from tokenizer
+
+    Returns:
+        Filtered list containing only valid word tokens
+
+    Examples:
+        >>> tokens = ["The", "year", "2026", "had", "365", "days"]
+        >>> normalize_for_readability(tokens)
+        ['The', 'year', 'had', 'days']
+
+        >>> tokens = ["Dr", "Smith", "works", "at", "U", ".", "S", ".", "Steel"]
+        >>> normalize_for_readability(tokens)
+        ['Dr', 'Smith', 'works', 'at', 'U', 'S', 'Steel']
+
+        >>> tokens = ["Email", "test@example.com", "for", "help"]
+        >>> normalize_for_readability(tokens)
+        ['Email', 'for', 'help']
+    """
+    return [token for token in tokens if is_word_token(token)]
+
+
+def normalize_for_stylometry(
+    tokens: list[str],
+    preserve_contractions: bool = True,
+    preserve_hyphens: bool = True,
+    min_length: int = 1,
+) -> list[str]:
+    """
+    Normalize tokens for stylometric analysis (authorship attribution, etc.).
+
+    More permissive than readability normalization. Preserves stylistic markers
+    that may be relevant for authorship analysis.
+
+    Args:
+        tokens: List of tokens from tokenizer
+        preserve_contractions: Keep contracted forms (default: True)
+        preserve_hyphens: Keep hyphenated words (default: True)
+        min_length: Minimum token length (default: 1)
+
+    Returns:
+        Filtered list of tokens suitable for stylometric analysis
+
+    Examples:
+        >>> tokens = ["don't", "re-enter", "test@example.com", "..."]
+        >>> normalize_for_stylometry(tokens)
+        ["don't", "re-enter"]
+
+        >>> normalize_for_stylometry(tokens, preserve_contractions=False)
+        ['re-enter']
+    """
+    result = []
+    for token in tokens:
+        # Check minimum length
+        if len(token) < min_length:
+            continue
+
+        # Skip URLs and emails (not stylistically relevant)
+        if "@" in token or token.startswith(("http://", "https://", "www.")):
+            continue
+
+        # Must contain at least one alphabetic character
+        if not any(c.isalpha() for c in token):
+            continue
+
+        # Handle contractions and hyphenated words (including tokens with both)
+        has_apostrophe = "'" in token
+        has_hyphen = "-" in token
+
+        if has_apostrophe or has_hyphen:
+            # Only consider valid word tokens
+            if not is_word_token(token):
+                continue
+
+            # Respect configuration flags for each stylistic feature present
+            if (has_apostrophe and not preserve_contractions) or (
+                has_hyphen and not preserve_hyphens
+            ):
+                continue
+
+            result.append(token)
+            continue
+
+        # Default: keep if it's a valid word
+        if is_word_token(token):
+            result.append(token)
+
+    return result
+
+
+def clean_for_syllable_counting(text: str) -> str:
+    """
+    Pre-clean text before tokenization for syllable-based readability metrics.
+
+    This is a defensive normalization layer that removes known problematic
+    patterns BEFORE tokenization, reducing the burden on token filtering.
+
+    Transformations:
+    - Remove URLs
+    - Remove email addresses
+    - Remove currency symbols with numbers ($99, £50, €100)
+    - Remove standalone numbers
+    - Normalize multiple spaces
+
+    Note: This is complementary to token-level filtering, not a replacement.
+    Both layers provide defense-in-depth against garbage syllable counts.
+
+    Args:
+        text: Raw input text
+
+    Returns:
+        Cleaned text ready for tokenization
+
+    Examples:
+        >>> clean_for_syllable_counting("Visit http://example.com today!")
+        'Visit today!'
+
+        >>> clean_for_syllable_counting("Email test@example.com for help")
+        'Email for help'
+
+        >>> clean_for_syllable_counting("The price is $99.99 on sale")
+        'The price is on sale'
+    """
+    # Remove URLs (http, https, www)
+    text = re.sub(r"https?://\S+", "", text)
+    text = re.sub(r"www\.\S+", "", text)
+
+    # Remove email addresses
+    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "", text)
+
+    # Remove currency patterns ($99, £50, €100, $50,000, etc.)
+    text = re.sub(r"[$£€¥]\d+(?:[,.]\d+)*", "", text)
+
+    # Remove standalone numbers (with optional decimals, commas)
+    text = re.sub(r"\b\d+(?:[,.]\d+)*\b", "", text)
+
+    # Normalize whitespace
+    text = re.sub(r"\s+", " ", text)
+
+    return text.strip()
+
+
+def validate_tokens_for_readability(tokens: list[str]) -> tuple[list[str], list[str]]:
+    """
+    Validate tokens for readability analysis and report problematic tokens.
+
+    This is a diagnostic function useful for debugging tokenization issues.
+    It separates valid word tokens from problematic non-words.
+
+    Args:
+        tokens: List of tokens to validate
+
+    Returns:
+        Tuple of (valid_tokens, invalid_tokens)
+
+    Examples:
+        >>> tokens = ["Hello", "2026", "test@example.com", "world"]
+        >>> valid, invalid = validate_tokens_for_readability(tokens)
+        >>> print(valid)
+        ['Hello', 'world']
+        >>> print(invalid)
+        ['2026', 'test@example.com']
+    """
+    valid = []
+    invalid = []
+
+    for token in tokens:
+        if is_word_token(token):
+            valid.append(token)
+        else:
+            invalid.append(token)
+
+    return valid, invalid
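Taken together, the new module provides two defensive layers: clean_for_syllable_counting() scrubs raw text before tokenization, and normalize_for_readability() / validate_tokens_for_readability() filter the resulting tokens. A minimal sketch of how the pieces compose, importing from the private module path added above (whether 1.0.0 also re-exports these helpers publicly is not shown in this diff); the token list is written out by hand because the package's tokenizer is not part of this file:

    from pystylometry._normalize import (
        clean_for_syllable_counting,
        normalize_for_readability,
        normalize_for_stylometry,
        validate_tokens_for_readability,
    )

    # Layer 1: text-level cleanup before tokenization.
    text = "Visit http://example.com today! The price is $99.99 for 2 users."
    print(clean_for_syllable_counting(text))
    # 'Visit today! The price is for users.'

    # Layer 2: token-level filtering after tokenization (hand-written token list).
    tokens = ["Visit", "example.com", "today", "!", "The", "price",
              "is", "$99.99", "don't", "re-enter"]
    print(normalize_for_readability(tokens))
    # ['Visit', 'today', 'The', 'price', 'is', "don't", 're-enter']
    print(normalize_for_stylometry(tokens, preserve_hyphens=False))
    # ['Visit', 'today', 'The', 'price', 'is', "don't"]
    valid, invalid = validate_tokens_for_readability(tokens)
    print(invalid)
    # ['example.com', '!', '$99.99']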