pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. pystylometry/__init__.py +30 -5
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1954 -28
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +26 -1
  6. pystylometry/authorship/additional_methods.py +75 -0
  7. pystylometry/authorship/kilgarriff.py +347 -0
  8. pystylometry/character/__init__.py +15 -0
  9. pystylometry/character/character_metrics.py +389 -0
  10. pystylometry/cli.py +427 -0
  11. pystylometry/consistency/__init__.py +57 -0
  12. pystylometry/consistency/_thresholds.py +162 -0
  13. pystylometry/consistency/drift.py +549 -0
  14. pystylometry/dialect/__init__.py +65 -0
  15. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  16. pystylometry/dialect/_loader.py +360 -0
  17. pystylometry/dialect/detector.py +533 -0
  18. pystylometry/lexical/__init__.py +13 -6
  19. pystylometry/lexical/advanced_diversity.py +680 -0
  20. pystylometry/lexical/function_words.py +590 -0
  21. pystylometry/lexical/hapax.py +310 -33
  22. pystylometry/lexical/mtld.py +180 -22
  23. pystylometry/lexical/ttr.py +149 -0
  24. pystylometry/lexical/word_frequency_sophistication.py +1805 -0
  25. pystylometry/lexical/yule.py +142 -29
  26. pystylometry/ngrams/__init__.py +2 -0
  27. pystylometry/ngrams/entropy.py +150 -49
  28. pystylometry/ngrams/extended_ngrams.py +235 -0
  29. pystylometry/prosody/__init__.py +12 -0
  30. pystylometry/prosody/rhythm_prosody.py +53 -0
  31. pystylometry/readability/__init__.py +12 -0
  32. pystylometry/readability/additional_formulas.py +2110 -0
  33. pystylometry/readability/ari.py +173 -35
  34. pystylometry/readability/coleman_liau.py +150 -30
  35. pystylometry/readability/complex_words.py +531 -0
  36. pystylometry/readability/flesch.py +181 -32
  37. pystylometry/readability/gunning_fog.py +208 -35
  38. pystylometry/readability/smog.py +126 -28
  39. pystylometry/readability/syllables.py +137 -30
  40. pystylometry/stylistic/__init__.py +20 -0
  41. pystylometry/stylistic/cohesion_coherence.py +45 -0
  42. pystylometry/stylistic/genre_register.py +45 -0
  43. pystylometry/stylistic/markers.py +131 -0
  44. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  45. pystylometry/syntactic/__init__.py +4 -0
  46. pystylometry/syntactic/advanced_syntactic.py +494 -0
  47. pystylometry/syntactic/pos_ratios.py +172 -17
  48. pystylometry/syntactic/sentence_stats.py +105 -18
  49. pystylometry/syntactic/sentence_types.py +526 -0
  50. pystylometry/viz/__init__.py +71 -0
  51. pystylometry/viz/drift.py +589 -0
  52. pystylometry/viz/jsx/__init__.py +31 -0
  53. pystylometry/viz/jsx/_base.py +144 -0
  54. pystylometry/viz/jsx/report.py +677 -0
  55. pystylometry/viz/jsx/timeline.py +716 -0
  56. pystylometry/viz/jsx/viewer.py +1032 -0
  57. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
  58. pystylometry-1.1.0.dist-info/RECORD +63 -0
  59. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  60. pystylometry-0.1.0.dist-info/RECORD +0 -26
  61. {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/__init__.py CHANGED
@@ -2,7 +2,7 @@
  pystylometry - Comprehensive Python package for stylometric analysis.

  A modular package for text analysis with lexical, readability, syntactic,
- authorship, and n-gram metrics.
+ authorship, n-gram, dialect detection, and consistency analysis metrics.

  Installation:
  pip install pystylometry # Core (lexical only)
@@ -16,7 +16,9 @@ Usage:
  from pystylometry.lexical import compute_mtld, compute_yule
  from pystylometry.readability import compute_flesch
  from pystylometry.syntactic import compute_pos_ratios
- from pystylometry.authorship import compute_burrows_delta
+ from pystylometry.authorship import compute_burrows_delta, compute_kilgarriff
+ from pystylometry.consistency import compute_kilgarriff_drift
+ from pystylometry.dialect import compute_dialect

  # Or use the unified analyze() function
  from pystylometry import analyze
@@ -24,6 +26,18 @@ Usage:
  results = analyze(text, lexical=True, readability=True)
  print(results.lexical['mtld'].mtld_average)
  print(results.readability['flesch'].reading_ease)
+
+ # Dialect detection
+ result = compute_dialect("The colour of the programme was brilliant.")
+ print(result.dialect) # 'british'
+ print(result.british_score) # 0.85
+
+ # Consistency analysis (Style Drift Detector - Issue #36)
+ from pystylometry.consistency import compute_kilgarriff_drift
+
+ result = compute_kilgarriff_drift(long_document)
+ print(result.pattern) # 'consistent', 'sudden_spike', 'suspiciously_uniform', etc.
+ print(result.pattern_confidence)
  """

  from ._types import AnalysisResult
@@ -49,14 +63,18 @@ try:
  except ImportError:
  _SYNTACTIC_AVAILABLE = False

- # Authorship and ngrams use only stdlib (no external dependencies)
+ # Authorship, ngrams, dialect, and consistency use only stdlib (no external dependencies)
  from . import (
  authorship, # noqa: F401
+ consistency, # noqa: F401 - Style drift detection (Issue #36)
+ dialect, # noqa: F401
  ngrams, # noqa: F401
  )

  _AUTHORSHIP_AVAILABLE = True
  _NGRAMS_AVAILABLE = True
+ _DIALECT_AVAILABLE = True
+ _CONSISTENCY_AVAILABLE = True


  def analyze(
@@ -109,8 +127,7 @@ def analyze(
  # Lexical metrics (always available)
  if lexical_metrics:
  result.lexical = {}
- # TODO: Add when stylometry-ttr is integrated
- # result.lexical['ttr'] = lexical.compute_ttr(text)
+ result.lexical["ttr"] = lexical.compute_ttr(text)
  result.lexical["mtld"] = lexical.compute_mtld(text)
  result.lexical["yule"] = lexical.compute_yule(text)
  result.lexical["hapax"] = lexical.compute_hapax_ratios(text)
@@ -178,6 +195,8 @@ def get_available_modules() -> dict[str, bool]:
  >>> available = get_available_modules()
  >>> if available['readability']:
  ... from pystylometry.readability import compute_flesch
+ >>> if available['consistency']:
+ ... from pystylometry.consistency import compute_kilgarriff_drift
  """
  return {
  "lexical": True, # Always available
@@ -185,6 +204,8 @@ def get_available_modules() -> dict[str, bool]:
  "syntactic": _SYNTACTIC_AVAILABLE,
  "authorship": _AUTHORSHIP_AVAILABLE,
  "ngrams": _NGRAMS_AVAILABLE,
+ "dialect": _DIALECT_AVAILABLE,
+ "consistency": _CONSISTENCY_AVAILABLE, # Style drift detection (Issue #36)
  }


@@ -204,3 +225,7 @@ if _AUTHORSHIP_AVAILABLE:
  __all__.append("authorship")
  if _NGRAMS_AVAILABLE:
  __all__.append("ngrams")
+ if _DIALECT_AVAILABLE:
+ __all__.append("dialect")
+ if _CONSISTENCY_AVAILABLE:
+ __all__.append("consistency")
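
Taken together, these additions change the top-level workflow. Below is a minimal sketch of the 1.1.0 surface, based only on the docstring snippets in the diff above; the result attributes dialect, british_score, pattern, and pattern_confidence are assumed from those snippets and are not otherwise verified against the released code.

# Sketch of the 1.1.0 additions shown in the __init__.py diff above.
from pystylometry import analyze, get_available_modules
from pystylometry.dialect import compute_dialect
from pystylometry.consistency import compute_kilgarriff_drift

text = "The colour of the programme was brilliant."
long_document = text * 200  # placeholder; drift detection expects a longer document

results = analyze(text, lexical=True, readability=True)
print(results.lexical["ttr"])  # 'ttr' is now populated instead of the old TODO

available = get_available_modules()
if available["dialect"]:
    d = compute_dialect(text)
    print(d.dialect, d.british_score)  # e.g. 'british', 0.85 per the docstring
if available["consistency"]:
    drift = compute_kilgarriff_drift(long_document)
    print(drift.pattern, drift.pattern_confidence)
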
pystylometry/_normalize.py ADDED
@@ -0,0 +1,277 @@
+ """Token normalization for stylometric analysis.
+
+ This module provides token filtering and normalization utilities for different
+ analysis scenarios. The primary use case is filtering out non-words (numbers,
+ URLs, emails, etc.) before passing tokens to readability metrics that rely on
+ syllable counting.
+
+ Design Philosophy:
+ -----------------
+ Different stylometric analyses require different normalization strategies:
+
+ 1. **Readability Metrics** (Flesch, SMOG, etc.):
+ - Strict filtering: only alphabetic words
+ - Removes numbers, URLs, emails, punctuation
+ - Prevents garbage/crashes in syllable counting
+
+ 2. **Authorship Attribution**:
+ - Preserve stylistic markers
+ - Keep contractions, hyphens, apostrophes
+ - More permissive filtering
+
+ 3. **Lexical Diversity**:
+ - Balance between cleanliness and vocabulary richness
+ - May keep some punctuation patterns
+ - Configurable based on research question
+
+ Critical Issue Addressed:
+ ------------------------
+ Without normalization, readability metrics receive non-words from the tokenizer:
+ - count_syllables("2026") → undefined behavior (crash or garbage)
+ - count_syllables("test@example.com") → undefined behavior
+ - count_syllables("C++") → undefined behavior
+ - count_syllables("$99.99") → undefined behavior
+
+ This module ensures only syllabifiable words reach syllable counting functions.
+ """
+
+ from __future__ import annotations
+
+ import re
+
+
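
The three scenarios in the module docstring map onto the functions added in the rest of this file: strict filtering for readability (normalize_for_readability), configurable filtering for authorship (normalize_for_stylometry), and a pre-tokenization pass (clean_for_syllable_counting). A minimal side-by-side sketch, assuming the implementations shown below behave as written:

from pystylometry._normalize import normalize_for_readability, normalize_for_stylometry

tokens = ["don't", "C++", "2026", "re-enter", "test@example.com"]

# Scenario 1 (readability): strict, only syllabifiable words survive.
print(normalize_for_readability(tokens))   # ["don't", 're-enter']

# Scenario 2 (authorship): same word filter, but configurable per stylistic marker.
print(normalize_for_stylometry(tokens))                          # ["don't", 're-enter']
print(normalize_for_stylometry(tokens, preserve_hyphens=False))  # ["don't"]
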
+ def is_word_token(token: str) -> bool:
+ """
+ Check if a token is a valid word for readability analysis.
+
+ A valid word token is:
+ - Purely alphabetic (including accented characters)
+ - May contain internal apostrophes (contractions like "don't")
+ - May contain internal hyphens (compound words like "co-operate")
+ - Does NOT start or end with punctuation
+
+ Args:
+ token: Token to validate
+
+ Returns:
+ True if token is a valid word
+
+ Examples:
+ >>> is_word_token("hello")
+ True
+ >>> is_word_token("don't")
+ True
+ >>> is_word_token("co-operate")
+ True
+ >>> is_word_token("123")
+ False
+ >>> is_word_token("test@example.com")
+ False
+ >>> is_word_token("...")
+ False
+ """
+ if not token or len(token) == 0:
+ return False
+
+ # Must start and end with alphabetic character
+ if not (token[0].isalpha() and token[-1].isalpha()):
+ return False
+
+ # Check middle characters - allow letters, apostrophes, hyphens
+ for char in token:
+ if not (char.isalpha() or char in ("'", "-")):
+ return False
+
+ return True
+
+
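
A few boundary cases that follow directly from the start/end and middle-character checks above; these are illustrative, assuming the function behaves as written, and are not doctests from the package:

from pystylometry._normalize import is_word_token

print(is_word_token("café"))           # True  - str.isalpha() accepts accented letters
print(is_word_token("C++"))            # False - must end with a letter
print(is_word_token("-well"))          # False - must start with a letter
print(is_word_token("rock-'n'-roll"))  # True  - internal hyphens and apostrophes are allowed
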
+ def normalize_for_readability(tokens: list[str]) -> list[str]:
+ """
+ Normalize tokens for readability metrics (e.g., Flesch, SMOG).
+
+ Filters tokens to only include valid words that can have syllables counted.
+ This prevents errors and garbage results from non-word tokens.
+
+ Filtering rules:
+ - Keep only alphabetic words (a-zA-Z)
+ - Keep contractions with apostrophes ("don't", "we're")
+ - Keep hyphenated compound words ("co-operate", "re-enter")
+ - Remove pure numbers ("2026", "3.14")
+ - Remove URLs ("http://example.com")
+ - Remove emails ("test@example.com")
+ - Remove special characters ("C++", "O'Brian" → keep, "$99.99" → remove)
+ - Remove pure punctuation ("...", "—", "!!!")
+
+ Args:
+ tokens: List of tokens from tokenizer
+
+ Returns:
+ Filtered list containing only valid word tokens
+
+ Examples:
+ >>> tokens = ["The", "year", "2026", "had", "365", "days"]
+ >>> normalize_for_readability(tokens)
+ ['The', 'year', 'had', 'days']
+
+ >>> tokens = ["Dr", "Smith", "works", "at", "U", ".", "S", ".", "Steel"]
+ >>> normalize_for_readability(tokens)
+ ['Dr', 'Smith', 'works', 'at', 'U', 'S', 'Steel']
+
+ >>> tokens = ["Email", "test@example.com", "for", "help"]
+ >>> normalize_for_readability(tokens)
+ ['Email', 'for', 'help']
+ """
+ return [token for token in tokens if is_word_token(token)]
+
+
+ def normalize_for_stylometry(
+ tokens: list[str],
+ preserve_contractions: bool = True,
+ preserve_hyphens: bool = True,
+ min_length: int = 1,
+ ) -> list[str]:
+ """
+ Normalize tokens for stylometric analysis (authorship attribution, etc.).
+
+ More permissive than readability normalization. Preserves stylistic markers
+ that may be relevant for authorship analysis.
+
+ Args:
+ tokens: List of tokens from tokenizer
+ preserve_contractions: Keep contracted forms (default: True)
+ preserve_hyphens: Keep hyphenated words (default: True)
+ min_length: Minimum token length (default: 1)
+
+ Returns:
+ Filtered list of tokens suitable for stylometric analysis
+
+ Examples:
+ >>> tokens = ["don't", "re-enter", "test@example.com", "..."]
+ >>> normalize_for_stylometry(tokens)
+ ["don't", "re-enter"]
+
+ >>> normalize_for_stylometry(tokens, preserve_contractions=False)
+ ['re-enter']
+ """
+ result = []
+ for token in tokens:
+ # Check minimum length
+ if len(token) < min_length:
+ continue
+
+ # Skip URLs and emails (not stylistically relevant)
+ if "@" in token or token.startswith(("http://", "https://", "www.")):
+ continue
+
+ # Must contain at least one alphabetic character
+ if not any(c.isalpha() for c in token):
+ continue
+
+ # Handle contractions and hyphenated words (including tokens with both)
+ has_apostrophe = "'" in token
+ has_hyphen = "-" in token
+
+ if has_apostrophe or has_hyphen:
+ # Only consider valid word tokens
+ if not is_word_token(token):
+ continue
+
+ # Respect configuration flags for each stylistic feature present
+ if (has_apostrophe and not preserve_contractions) or (
+ has_hyphen and not preserve_hyphens
+ ):
+ continue
+
+ result.append(token)
+ continue
+
+ # Default: keep if it's a valid word
+ if is_word_token(token):
+ result.append(token)
+
+ return result
+
+
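
One subtlety in the loop above: a token containing both an apostrophe and a hyphen is dropped if either of the corresponding preserve flags is disabled. A small check, assuming the implementation as written (not a doctest from the package):

from pystylometry._normalize import normalize_for_stylometry

tokens = ["e-mail's", "don't", "well-known"]
print(normalize_for_stylometry(tokens))                               # ["e-mail's", "don't", 'well-known']
print(normalize_for_stylometry(tokens, preserve_hyphens=False))       # ["don't"]
print(normalize_for_stylometry(tokens, preserve_contractions=False))  # ['well-known']
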
+ def clean_for_syllable_counting(text: str) -> str:
+ """
+ Pre-clean text before tokenization for syllable-based readability metrics.
+
+ This is a defensive normalization layer that removes known problematic
+ patterns BEFORE tokenization, reducing the burden on token filtering.
+
+ Transformations:
+ - Remove URLs
+ - Remove email addresses
+ - Remove currency symbols with numbers ($99, £50, €100)
+ - Remove standalone numbers
+ - Normalize multiple spaces
+
+ Note: This is complementary to token-level filtering, not a replacement.
+ Both layers provide defense-in-depth against garbage syllable counts.
+
+ Args:
+ text: Raw input text
+
+ Returns:
+ Cleaned text ready for tokenization
+
+ Examples:
+ >>> clean_for_syllable_counting("Visit http://example.com today!")
+ 'Visit today!'
+
+ >>> clean_for_syllable_counting("Email test@example.com for help")
+ 'Email for help'
+
+ >>> clean_for_syllable_counting("The price is $99.99 on sale")
+ 'The price is on sale'
+ """
+ # Remove URLs (http, https, www)
+ text = re.sub(r"https?://\S+", "", text)
+ text = re.sub(r"www\.\S+", "", text)
+
+ # Remove email addresses
+ text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b", "", text)
+
+ # Remove currency patterns ($99, £50, €100, $50,000, etc.)
+ text = re.sub(r"[$£€¥]\d+(?:[,.]\d+)*", "", text)
+
+ # Remove standalone numbers (with optional decimals, commas)
+ text = re.sub(r"\b\d+(?:[,.]\d+)*\b", "", text)
+
+ # Normalize whitespace
+ text = re.sub(r"\s+", " ", text)
+
+ return text.strip()
+
+
+ def validate_tokens_for_readability(tokens: list[str]) -> tuple[list[str], list[str]]:
+ """
+ Validate tokens for readability analysis and report problematic tokens.
+
+ This is a diagnostic function useful for debugging tokenization issues.
+ It separates valid word tokens from problematic non-words.
+
+ Args:
+ tokens: List of tokens to validate
+
+ Returns:
+ Tuple of (valid_tokens, invalid_tokens)
+
+ Examples:
+ >>> tokens = ["Hello", "2026", "test@example.com", "world"]
+ >>> valid, invalid = validate_tokens_for_readability(tokens)
+ >>> print(valid)
+ ['Hello', 'world']
+ >>> print(invalid)
+ ['2026', 'test@example.com']
+ """
+ valid = []
+ invalid = []
+
+ for token in tokens:
+ if is_word_token(token):
+ valid.append(token)
+ else:
+ invalid.append(token)
+
+ return valid, invalid
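
Putting the two layers together gives the defense-in-depth described in clean_for_syllable_counting's docstring. The package's own tokenizer is not shown in this diff, so a plain regex split stands in for it here; everything else uses the functions added above:

import re
from pystylometry._normalize import (
    clean_for_syllable_counting,
    normalize_for_readability,
    validate_tokens_for_readability,
)

raw = "Email test@example.com or pay $99.99 at http://example.com before 2026."

# Layer 1: pre-clean the raw text, then tokenize (regex split as a stand-in tokenizer).
cleaned = clean_for_syllable_counting(raw)
tokens = re.findall(r"[A-Za-z'-]+", cleaned)

# Layer 2: token-level filtering before any syllable counting.
words = normalize_for_readability(tokens)
print(words)  # ['Email', 'or', 'pay', 'at', 'before']

# Diagnostic view of what would leak through without the cleaning pass.
valid, invalid = validate_tokens_for_readability(raw.split())
print(invalid)  # ['test@example.com', '$99.99', 'http://example.com', '2026.']
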