pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes and reflects the package contents as they appear in that registry.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/readability/flesch.py
@@ -1,19 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level."""
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 from .._normalize import normalize_for_readability
-from .._types import FleschResult
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_flesch(text: str) -> FleschResult:
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
         Higher scores = easier to read
-        Typical range: 0-100, but can exceed bounds for extremely simple (>100) or complex (<0) text
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
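
A quick numeric check of the two formulas documented above (a standalone sketch, not part of the diff; the word, sentence, and syllable counts are invented for illustration):

    # Worked example of the Flesch formulas with assumed counts:
    # 12 words across 2 sentences, 16 total syllables.
    words, sentences, syllables = 12, 2, 16

    words_per_sentence = words / sentences    # 6.0
    syllables_per_word = syllables / words    # ~1.33

    reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    print(round(reading_ease, 1))  # ~87.9 -> "Easy" (>= 80) on the scale above
    print(round(grade_level, 1))   # ~2.5  -> roughly early primary school

Note that syllables-per-word dominates reading ease (coefficient 84.6 versus 1.015), which is why polysyllabic prose scores as difficult even when sentences are short.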
@@ -27,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
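
The difficulty scale in this docstring maps onto _get_difficulty from the first hunk; boundary scores take the easier label because the comparisons use >=. A doctest-style check, assuming the function exactly as defined in the diff above:

    >>> _get_difficulty(90.0)   # boundary goes to the easier band
    'Very Easy'
    >>> _get_difficulty(89.9)
    'Easy'
    >>> _get_difficulty(60.0)
    'Standard'
    >>> _get_difficulty(29.9)
    'Very Difficult'
    >>> _get_difficulty(float('nan'))   # empty input propagates as NaN
    'Unknown'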
@@ -36,91 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with reading ease, grade level, and difficulty rating
-
-    Note: The difficulty label ("Very Easy", "Easy", etc.) is determined solely
-    from the reading_ease score and does NOT consider the grade_level score.
-    This means text with high reading_ease (e.g., 85 = "Easy") but high
-    grade_level (e.g., 12 = college) will still be labeled "Easy". The two
-    metrics measure different aspects of readability and may not always align.
-
-    Note: For empty input (no sentences or words), reading_ease and grade_level
-    will be float('nan'). This prevents conflating "no data" with "extremely
-    difficult text" (score of 0). Consumers should check for NaN before
-    performing arithmetic operations (e.g., using math.isnan() or filtering
-    before aggregation) to avoid silent propagation of NaN in statistics.
+        FleschResult with:
+        - reading_ease: Mean reading ease across chunks
+        - grade_level: Mean grade level across chunks
+        - difficulty: Difficulty rating based on mean reading_ease
+        - reading_ease_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Reading Ease: {result.reading_ease:.1f}")
-        >>> print(f"Grade Level: {result.grade_level:.1f}")
-        >>> print(f"Difficulty: {result.difficulty}")
-
-        >>> # Empty input returns NaN
-        >>> import math
-        >>> result_empty = compute_flesch("")
-        >>> math.isnan(result_empty.reading_ease)
-        True
-        >>> result_empty.difficulty
-        'Unknown'
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
-
-    # Filter tokens to only valid words for syllable counting
-    # Removes numbers, URLs, emails, etc. that would cause errors
-    word_tokens = normalize_for_readability(tokens)
-
-    if len(sentences) == 0 or len(word_tokens) == 0:
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FleschResult(
             reading_ease=float("nan"),
             grade_level=float("nan"),
             difficulty="Unknown",
-            metadata={"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
 
-    # Count syllables (safe now - only valid words)
-    total_syllables = sum(count_syllables(word) for word in word_tokens)
-
-    # Calculate metrics
-    words_per_sentence = len(word_tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(word_tokens)
-
-    # Flesch Reading Ease: 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
-
-    # Flesch-Kincaid Grade Level: 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
-    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    # Determine difficulty rating based ONLY on reading ease score (not grade level)
-    # This is a conscious design choice: difficulty labels follow the Reading Ease
-    # thresholds exclusively, even though grade_level may suggest a different difficulty
-    if reading_ease >= 90:
-        difficulty = "Very Easy"
-    elif reading_ease >= 80:
-        difficulty = "Easy"
-    elif reading_ease >= 70:
-        difficulty = "Fairly Easy"
-    elif reading_ease >= 60:
-        difficulty = "Standard"
-    elif reading_ease >= 50:
-        difficulty = "Fairly Difficult"
-    elif reading_ease >= 30:
-        difficulty = "Difficult"
-    else:
-        difficulty = "Very Difficult"
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=reading_ease,
-        grade_level=grade_level,
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(word_tokens),
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
        },
    )
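
The new code path leans on two helpers imported from .._types that this diff does not display: chunk_text and make_distribution (pystylometry/_types.py changed separately, +1017 -259). A minimal sketch of plausible implementations, assuming word-based chunking and exactly the Distribution fields constructed above (values, mean, median, std, range, iqr); the package's real helpers may differ:

    # Hypothetical stand-ins for pystylometry's chunk_text / make_distribution.
    # Field names mirror the Distribution(...) construction in the diff above;
    # the chunking rule and std flavor (population vs. sample) are assumptions.
    import statistics
    from dataclasses import dataclass

    @dataclass
    class Distribution:
        values: list[float]
        mean: float
        median: float
        std: float
        range: float
        iqr: float

    def chunk_text(text: str, chunk_size: int) -> list[str]:
        """Split text into consecutive chunks of chunk_size whitespace tokens."""
        words = text.split()
        return [
            " ".join(words[i : i + chunk_size])
            for i in range(0, len(words), chunk_size)
        ] or [""]  # empty input still yields one (empty) chunk

    def make_distribution(values: list[float]) -> Distribution:
        """Summarize per-chunk metric values (callers pass non-empty lists)."""
        if len(values) >= 2:
            q1, _, q3 = statistics.quantiles(values, n=4)
        else:
            q1 = q3 = values[0]
        return Distribution(
            values=values,
            mean=statistics.fmean(values),
            median=statistics.median(values),
            std=statistics.pstdev(values),
            range=max(values) - min(values),
            iqr=q3 - q1,
        )

With helpers like these, compute_flesch("word " * 2500, chunk_size=1000) would split into three chunks (1000, 1000, and 500 words) and summarize three per-chunk scores: the mean lands in reading_ease, while the spread (std, range, iqr) carries the stylometric signal the module docstring refers to.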