pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff shows the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/__init__.py CHANGED
@@ -109,8 +109,7 @@ def analyze(
     # Lexical metrics (always available)
     if lexical_metrics:
         result.lexical = {}
-        # TODO: Add when stylometry-ttr is integrated
-        # result.lexical['ttr'] = lexical.compute_ttr(text)
+        result.lexical["ttr"] = lexical.compute_ttr(text)
         result.lexical["mtld"] = lexical.compute_mtld(text)
         result.lexical["yule"] = lexical.compute_yule(text)
         result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
pystylometry/_normalize.py ADDED
@@ -0,0 +1,277 @@
+"""Token normalization for stylometric analysis.
+
+This module provides token filtering and normalization utilities for different
+analysis scenarios. The primary use case is filtering out non-words (numbers,
+URLs, emails, etc.) before passing tokens to readability metrics that rely on
+syllable counting.
+
+Design Philosophy:
+-----------------
+Different stylometric analyses require different normalization strategies:
+
+1. **Readability Metrics** (Flesch, SMOG, etc.):
+   - Strict filtering: only alphabetic words
+   - Removes numbers, URLs, emails, punctuation
+   - Prevents garbage/crashes in syllable counting
+
+2. **Authorship Attribution**:
+   - Preserve stylistic markers
+   - Keep contractions, hyphens, apostrophes
+   - More permissive filtering
+
+3. **Lexical Diversity**:
+   - Balance between cleanliness and vocabulary richness
+   - May keep some punctuation patterns
+   - Configurable based on research question
+
+Critical Issue Addressed:
+------------------------
+Without normalization, readability metrics receive non-words from the tokenizer:
+- count_syllables("2026") → undefined behavior (crash or garbage)
+- count_syllables("test@example.com") → undefined behavior
+- count_syllables("C++") → undefined behavior
+- count_syllables("$99.99") → undefined behavior
+
+This module ensures only syllabifiable words reach syllable counting functions.
+"""
+
+from __future__ import annotations
+
+import re
+
+
+def is_word_token(token: str) -> bool:
+    """
+    Check if a token is a valid word for readability analysis.
+
+    A valid word token is:
+    - Purely alphabetic (including accented characters)
+    - May contain internal apostrophes (contractions like "don't")
+    - May contain internal hyphens (compound words like "co-operate")
+    - Does NOT start or end with punctuation
+
+    Args:
+        token: Token to validate
+
+    Returns:
+        True if token is a valid word
+
+    Examples:
+        >>> is_word_token("hello")
+        True
+        >>> is_word_token("don't")
+        True
+        >>> is_word_token("co-operate")
+        True
+        >>> is_word_token("123")
+        False
+        >>> is_word_token("test@example.com")
+        False
+        >>> is_word_token("...")
+        False
+    """
+    if not token or len(token) == 0:
+        return False
+
+    # Must start and end with alphabetic character
+    if not (token[0].isalpha() and token[-1].isalpha()):
+        return False
+
+    # Check middle characters - allow letters, apostrophes, hyphens
+    for char in token:
+        if not (char.isalpha() or char in ("'", "-")):
+            return False
+
+    return True
+
+
+def normalize_for_readability(tokens: list[str]) -> list[str]:
+    """
+    Normalize tokens for readability metrics (e.g., Flesch, SMOG).
+
+    Filters tokens to only include valid words that can have syllables counted.
+    This prevents errors and garbage results from non-word tokens.
+
+    Filtering rules:
+    - Keep only alphabetic words (a-zA-Z)
+    - Keep contractions with apostrophes ("don't", "we're")
+    - Keep hyphenated compound words ("co-operate", "re-enter")
+    - Remove pure numbers ("2026", "3.14")
+    - Remove URLs ("http://example.com")
+    - Remove emails ("test@example.com")
+    - Remove tokens with other special characters ("C++", "$99.99"); "O'Brian" is kept
+    - Remove pure punctuation ("...", "—", "!!!")
+
+    Args:
+        tokens: List of tokens from tokenizer
+
+    Returns:
+        Filtered list containing only valid word tokens
+
+    Examples:
+        >>> tokens = ["The", "year", "2026", "had", "365", "days"]
+        >>> normalize_for_readability(tokens)
+        ['The', 'year', 'had', 'days']
+
+        >>> tokens = ["Dr", "Smith", "works", "at", "U", ".", "S", ".", "Steel"]
+        >>> normalize_for_readability(tokens)
+        ['Dr', 'Smith', 'works', 'at', 'U', 'S', 'Steel']
+
+        >>> tokens = ["Email", "test@example.com", "for", "help"]
+        >>> normalize_for_readability(tokens)
+        ['Email', 'for', 'help']
+    """
+    return [token for token in tokens if is_word_token(token)]
+
+
+def normalize_for_stylometry(
+    tokens: list[str],
+    preserve_contractions: bool = True,
+    preserve_hyphens: bool = True,
+    min_length: int = 1,
+) -> list[str]:
+    """
+    Normalize tokens for stylometric analysis (authorship attribution, etc.).
+
+    More permissive than readability normalization. Preserves stylistic markers
+    that may be relevant for authorship analysis.
+
+    Args:
+        tokens: List of tokens from tokenizer
+        preserve_contractions: Keep contracted forms (default: True)
+        preserve_hyphens: Keep hyphenated words (default: True)
+        min_length: Minimum token length (default: 1)
+
+    Returns:
+        Filtered list of tokens suitable for stylometric analysis
+
+    Examples:
+        >>> tokens = ["don't", "re-enter", "test@example.com", "..."]
+        >>> normalize_for_stylometry(tokens)
+        ["don't", "re-enter"]
+
+        >>> normalize_for_stylometry(tokens, preserve_contractions=False)
+        ['re-enter']
+    """
+    result = []
+    for token in tokens:
+        # Check minimum length
+        if len(token) < min_length:
+            continue
+
+        # Skip URLs and emails (not stylistically relevant)
+        if "@" in token or token.startswith(("http://", "https://", "www.")):
+            continue
+
+        # Must contain at least one alphabetic character
+        if not any(c.isalpha() for c in token):
+            continue
+
+        # Handle contractions and hyphenated words (including tokens with both)
+        has_apostrophe = "'" in token
+        has_hyphen = "-" in token
+
+        if has_apostrophe or has_hyphen:
+            # Only consider valid word tokens
+            if not is_word_token(token):
+                continue
+
+            # Respect configuration flags for each stylistic feature present
+            if (has_apostrophe and not preserve_contractions) or (
+                has_hyphen and not preserve_hyphens
+            ):
+                continue
+
+            result.append(token)
+            continue
+
+        # Default: keep if it's a valid word
+        if is_word_token(token):
+            result.append(token)
+
+    return result
+
+
+def clean_for_syllable_counting(text: str) -> str:
+    """
+    Pre-clean text before tokenization for syllable-based readability metrics.
+
+    This is a defensive normalization layer that removes known problematic
+    patterns BEFORE tokenization, reducing the burden on token filtering.
+
+    Transformations:
+    - Remove URLs
+    - Remove email addresses
+    - Remove currency symbols with numbers ($99, £50, €100)
+    - Remove standalone numbers
+    - Normalize multiple spaces
+
+    Note: This is complementary to token-level filtering, not a replacement.
+    Both layers provide defense-in-depth against garbage syllable counts.
+
+    Args:
+        text: Raw input text
+
+    Returns:
+        Cleaned text ready for tokenization
+
+    Examples:
+        >>> clean_for_syllable_counting("Visit http://example.com today!")
+        'Visit today!'
+
+        >>> clean_for_syllable_counting("Email test@example.com for help")
+        'Email for help'
+
+        >>> clean_for_syllable_counting("The price is $99.99 on sale")
+        'The price is on sale'
+    """
+    # Remove URLs (http, https, www)
+    text = re.sub(r"https?://\S+", "", text)
+    text = re.sub(r"www\.\S+", "", text)
+
+    # Remove email addresses
+    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "", text)
+
+    # Remove currency patterns ($99, £50, €100, $50,000, etc.)
+    text = re.sub(r"[$£€¥]\d+(?:[,.]\d+)*", "", text)
+
+    # Remove standalone numbers (with optional decimals, commas)
+    text = re.sub(r"\b\d+(?:[,.]\d+)*\b", "", text)
+
+    # Normalize whitespace
+    text = re.sub(r"\s+", " ", text)
+
+    return text.strip()
+
+
+def validate_tokens_for_readability(tokens: list[str]) -> tuple[list[str], list[str]]:
+    """
+    Validate tokens for readability analysis and report problematic tokens.
+
+    This is a diagnostic function useful for debugging tokenization issues.
+    It separates valid word tokens from problematic non-words.
+
+    Args:
+        tokens: List of tokens to validate
+
+    Returns:
+        Tuple of (valid_tokens, invalid_tokens)
+
+    Examples:
+        >>> tokens = ["Hello", "2026", "test@example.com", "world"]
+        >>> valid, invalid = validate_tokens_for_readability(tokens)
+        >>> print(valid)
+        ['Hello', 'world']
+        >>> print(invalid)
+        ['2026', 'test@example.com']
+    """
+    valid = []
+    invalid = []
+
+    for token in tokens:
+        if is_word_token(token):
+            valid.append(token)
+        else:
+            invalid.append(token)
+
+    return valid, invalid
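Taken together, the new helpers form the two-layer guard described in the module docstring: clean_for_syllable_counting() scrubs the raw text, and the token-level functions filter whatever the tokenizer still lets through before readability or authorship metrics run. A short usage sketch (the whitespace split is a stand-in for the package's own tokenizer, which is not part of this file):

    from pystylometry._normalize import (
        clean_for_syllable_counting,
        normalize_for_readability,
        normalize_for_stylometry,
        validate_tokens_for_readability,
    )

    raw = "Email test@example.com about the $99.99 deal in 2026, don't wait!"

    # Layer 1: strip emails, currency amounts, and bare numbers before tokenization.
    cleaned = clean_for_syllable_counting(raw)

    # Stand-in tokenizer for this sketch only.
    tokens = cleaned.replace(",", " ").replace("!", " ").split()

    # Layer 2: token-level filtering for different analyses.
    readable = normalize_for_readability(tokens)                  # only is_word_token() survivors
    stylo = normalize_for_stylometry(tokens, preserve_contractions=True)
    valid, invalid = validate_tokens_for_readability(tokens)      # diagnostic split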