pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
Detailed diff for pystylometry/readability/flesch.py (+177 -73):

```diff
@@ -1,19 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 from .._normalize import normalize_for_readability
-from .._types import FleschResult
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_flesch(text: str) -> FleschResult:
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
         Higher scores = easier to read
-        Typical range: 0-100, but can exceed bounds
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
```
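Both formulas in the docstring are easy to sanity-check by hand. A minimal standalone sketch, independent of the package, with syllables tallied manually rather than via `count_syllables`:

```python
# Hand-check of the two formulas for: "The cat sat on the mat."
sentences = 1
words = 6          # six tokens, all valid words
syllables = 6      # every word is monosyllabic

words_per_sentence = words / sentences   # 6.0
syllables_per_word = syllables / words   # 1.0

reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

print(round(reading_ease, 3))  # 116.145 (above 100)
print(round(grade_level, 2))   # -1.45 (below grade 0)
```

Both values land outside the nominal ranges, which is exactly the "can exceed bounds" caveat in the docstring.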
```diff
@@ -27,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
```
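One detail worth noting in `_get_difficulty` from the first hunk: the explicit `math.isnan` guard matters because NaN fails every `>=` comparison, so without it an empty-input score would silently fall through to "Very Difficult". A standalone restatement of the same ladder, exercised at the band edges:

```python
import math

# Restatement of the _get_difficulty ladder as a table, for testing.
BANDS = [
    (90, "Very Easy"),
    (80, "Easy"),
    (70, "Fairly Easy"),
    (60, "Standard"),
    (50, "Fairly Difficult"),
    (30, "Difficult"),
]

def get_difficulty(reading_ease: float) -> str:
    if math.isnan(reading_ease):
        return "Unknown"  # NaN fails every >= test; guard explicitly
    for threshold, label in BANDS:
        if reading_ease >= threshold:
            return label
    return "Very Difficult"

assert get_difficulty(90.0) == "Very Easy"        # boundaries are inclusive
assert get_difficulty(59.9) == "Fairly Difficult"
assert get_difficulty(float("nan")) == "Unknown"  # not "Very Difficult"
```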
```diff
@@ -36,91 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with
-
-
-
-
-
-
-
-    Note: For empty input (no sentences or words), reading_ease and grade_level
-        will be float('nan'). This prevents conflating "no data" with "extremely
-        difficult text" (score of 0). Consumers should check for NaN before
-        performing arithmetic operations (e.g., using math.isnan() or filtering
-        before aggregation) to avoid silent propagation of NaN in statistics.
+        FleschResult with:
+            - reading_ease: Mean reading ease across chunks
+            - grade_level: Mean grade level across chunks
+            - difficulty: Difficulty rating based on mean reading_ease
+            - reading_ease_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("
-        >>>
-
-        >>>
-
-        >>> #
-
-        >>>
-
-
-        >>>
-
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
```
```diff
-
-
-
-
-
-
-
-
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+        total_sentences += meta.get("sentence_count", 0)
+        total_words += meta.get("word_count", 0)
+        total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
```
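`chunk_text` is likewise imported from `_types` and not shown on this page. Given the documented contract ("number of words per chunk"), a whitespace-based sketch; the real helper may handle boundaries or empty input differently:

```python
def chunk_text(text: str, chunk_size: int) -> list[str]:
    # Whitespace-based sketch: each chunk holds at most chunk_size words,
    # and the final chunk may be shorter. Rejoining on single spaces keeps
    # sentence punctuation, which is what split_sentences() needs.
    words = text.split()
    if not words:
        return []  # assumption: the empty-input representation is not shown
    return [
        " ".join(words[i : i + chunk_size])
        for i in range(0, len(words), chunk_size)
    ]
```

Note that the final partial chunk is scored with the same weight as full chunks, which is one reason the per-chunk mean can differ from a single-pass score over the whole text.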
```diff
         return FleschResult(
             reading_ease=float("nan"),
             grade_level=float("nan"),
             difficulty="Unknown",
-
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
```
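The duplicated metadata keys are deliberate: 1.0.x consumers that read `sentence_count` keep working, while new code can use the `total_`-prefixed names. Hypothetical usage:

```python
# some_text is a hypothetical placeholder for any document.
result = compute_flesch(some_text)

# The 1.0.x-era key and the 1.1.0 prefixed key carry the same totals:
assert result.metadata["sentence_count"] == result.metadata["total_sentence_count"]
assert result.metadata["word_count"] == result.metadata["total_word_count"]
```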
```diff
 
-    #
-
-
-    # Calculate metrics
-    words_per_sentence = len(word_tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(word_tokens)
-
-    # Flesch Reading Ease: 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
-
-    # Flesch-Kincaid Grade Level: 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
-    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    #
-
-
-
-        difficulty = "Very Easy"
-    elif reading_ease >= 80:
-        difficulty = "Easy"
-    elif reading_ease >= 70:
-        difficulty = "Fairly Easy"
-    elif reading_ease >= 60:
-        difficulty = "Standard"
-    elif reading_ease >= 50:
-        difficulty = "Fairly Difficult"
-    elif reading_ease >= 30:
-        difficulty = "Difficult"
-    else:
-        difficulty = "Very Difficult"
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=
-        grade_level=
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
-
-            "
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
         },
     )
```
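Finally, the NaN convention carried over from 1.0.0 still applies at the aggregate level: if no chunk yields a valid score, `reading_ease` and `grade_level` are NaN rather than 0, and `difficulty` is "Unknown". Based on the code above:

```python
import math

from pystylometry.readability.flesch import compute_flesch

# Empty input: no chunk produces a valid score, so the means are NaN
# rather than 0 ("no data" is not "maximally difficult text").
result = compute_flesch("")
assert math.isnan(result.reading_ease)
assert result.difficulty == "Unknown"
assert result.reading_ease_dist.values == []

# Branch on NaN before using the score in downstream statistics:
if not math.isnan(result.reading_ease):
    print(f"{result.reading_ease:.1f} ({result.difficulty})")
```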