pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/flesch.py

@@ -1,17 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
-
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-        Higher scores = easier to read
+        Higher scores = easier to read
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
@@ -25,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
@@ -34,48 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with
+        FleschResult with:
+            - reading_ease: Mean reading ease across chunks
+            - grade_level: Mean grade level across chunks
+            - difficulty: Difficulty rating based on mean reading_ease
+            - reading_ease_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("
-        >>>
-
-        >>>
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
-
-
-
-
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FleschResult(
-            reading_ease=
-            grade_level=
+            reading_ease=float("nan"),
+            grade_level=float("nan"),
             difficulty="Unknown",
-
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
 
-    #
-
-
-    # Calculate metrics
-    words_per_sentence = len(tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(tokens)
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    #
-
-
-    difficulty =
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=
-        grade_level=
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
-
-            "
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
        },
    )
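As a quick orientation to the changes above, the arithmetic in _compute_flesch_single reduces to two weighted sums. The snippet below is illustrative only: the per-sentence and per-word counts are hypothetical, not output from the package, and the difficulty band follows the _get_difficulty thresholds shown in the diff.

    # Hypothetical counts, for illustration only (not package output)
    words_per_sentence = 15.0
    syllables_per_word = 1.5

    # Same formulas as _compute_flesch_single above
    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59

    print(round(reading_ease, 2))  # 64.71 -> "Standard" band (>= 60)
    print(round(grade_level, 2))   # 7.96  -> roughly an 8th-grade reading level

compute_flesch repeats this arithmetic per chunk and summarizes the per-chunk values through make_distribution, so reading_ease is the mean across chunks while reading_ease_dist carries the spread (std, range, iqr).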
pystylometry/readability/gunning_fog.py

@@ -1,63 +1,236 @@
-"""Gunning Fog Index.
+"""Gunning Fog Index with NLP-enhanced complex word detection.
 
-
+This module computes the Gunning Fog Index, a readability metric that
+estimates the years of formal education needed to understand text on first reading.
+
+This implementation includes native chunked analysis for stylometric fingerprinting.
+
+Related GitHub Issues:
+    #4 - NLP-enhanced complex word detection
+    #27 - Native chunked analysis with Distribution dataclass
+
+Historical Background:
+----------------------
+The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
+work helping businesses improve the clarity of their writing. The formula produces
+a U.S. grade-level score (e.g., 12 = high school senior reading level).
+
+Reference:
+    Gunning, R. (1952). The Technique of Clear Writing.
+    McGraw-Hill, New York.
+"""
+
+import math
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, GunningFogResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
-from .
+from .complex_words import process_text_for_complex_words
+
+# Formula coefficient from Gunning (1952)
+_FOG_COEFFICIENT = 0.4
+
 
+def _compute_gunning_fog_single(text: str, spacy_model: str) -> tuple[float, float, dict]:
+    """Compute Gunning Fog metrics for a single chunk of text.
 
-
+    Returns:
+        Tuple of (fog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
    """
-
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = normalize_for_readability(all_tokens)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+            },
+        )
+
+    # Count complex words using NLP-enhanced detection
+    complex_word_count, detection_metadata = process_text_for_complex_words(
+        text, tokens, model=spacy_model
+    )
 
-
+    # Calculate formula components
+    average_words_per_sentence = len(tokens) / len(sentences)
+    complex_word_percentage = (complex_word_count / len(tokens)) * 100
+
+    # Apply Gunning Fog formula
+    fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
+    grade_level = max(0, min(20, round(fog_index)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "complex_word_count": complex_word_count,
+        "complex_word_percentage": complex_word_percentage,
+        "average_words_per_sentence": average_words_per_sentence,
+        **detection_metadata,
+    }
+
+    return (fog_index, float(grade_level), metadata)
+
+
+def compute_gunning_fog(
+    text: str, chunk_size: int = 1000, spacy_model: str = "en_core_web_sm"
+) -> GunningFogResult:
+    """
+    Compute Gunning Fog Index with NLP-enhanced complex word detection.
+
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
+    Formula (Gunning, 1952):
+    ------------------------
     Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
 
-    Where complex words are
-
+    Where complex words are words with 3+ syllables, EXCLUDING:
+        1. Proper nouns (names, places, organizations)
+        2. Compound words (hyphenated)
+        3. Common verb forms (-es, -ed, -ing endings)
 
-
-
+    Related GitHub Issues:
+        #4 - NLP-enhanced complex word detection
+        #27 - Native chunked analysis with Distribution dataclass
 
-
-        Gunning, R. (1952). The Technique of Clear Writing.
-        McGraw-Hill.
+    Reference:
+        Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
+        spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
 
     Returns:
-        GunningFogResult with
+        GunningFogResult with:
+            - fog_index: Mean Fog Index across chunks
+            - grade_level: Mean grade level across chunks
+            - fog_index_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_gunning_fog("
-        >>>
-
+        >>> result = compute_gunning_fog("Long text here...", chunk_size=1000)
+        >>> result.fog_index  # Mean across chunks
+        12.5
+        >>> result.fog_index_dist.std  # Variance reveals fingerprint
+        2.1
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-
+    # Compute metrics per chunk
+    fog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_complex = 0
+    detection_metadata: dict = {}
+
+    for chunk in chunks:
+        fi, gl, meta = _compute_gunning_fog_single(chunk, spacy_model)
+        if not math.isnan(fi):
+            fog_values.append(fi)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_complex += meta.get("complex_word_count", 0)
+            # Capture detection metadata from first chunk (same for all chunks)
+            if not detection_metadata and "mode" in meta:
+                detection_metadata = {
+                    "mode": meta.get("mode"),
+                    "proper_noun_detection": meta.get("proper_noun_detection"),
+                    "inflection_handling": meta.get("inflection_handling"),
+                }
+                if "spacy_model" in meta:
+                    detection_metadata["spacy_model"] = meta.get("spacy_model")
+
+    # Handle empty or all-invalid chunks
+    if not fog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return GunningFogResult(
-            fog_index=
-            grade_level=
-
+            fog_index=float("nan"),
+            grade_level=float("nan"),
+            fog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+                "average_words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_complex_word_count": 0,
+                "reliable": False,
+                # Detection metadata
+                "mode": "none",
+                "proper_noun_detection": "none",
+                "inflection_handling": "none",
+            },
        )
 
-    #
-
-
+    # Build distributions
+    fog_dist = make_distribution(fog_values)
+    grade_dist = make_distribution(grade_values)
+
+    # Reliability heuristic
+    reliable = total_words >= 100 and total_sentences >= 3
 
-    #
-
-
+    # Ensure detection metadata has defaults
+    if not detection_metadata:
+        detection_metadata = {
+            "mode": "none",
+            "proper_noun_detection": "none",
+            "inflection_handling": "none",
+        }
 
     return GunningFogResult(
-        fog_index=
-        grade_level=
+        fog_index=fog_dist.mean,
+        grade_level=grade_dist.mean,
+        fog_index_dist=fog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "complex_word_count": total_complex,
+            "complex_word_percentage": (total_complex / total_words * 100)
+            if total_words > 0
+            else 0,
+            "average_words_per_sentence": total_words / total_sentences
+            if total_sentences > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_complex_word_count": total_complex,
+            "reliable": reliable,
+            # Detection metadata
+            **detection_metadata,
        },
    )
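The Gunning Fog computation above follows the same pattern: per chunk it boils down to one weighted sum plus a 0-20 clamp on the rounded grade. The snippet below is illustrative only, with hypothetical counts standing in for the output of the complex-word detector.

    # Hypothetical counts, for illustration only (not package output)
    average_words_per_sentence = 20.0
    complex_word_percentage = 10.0  # 100 * complex_words / words

    # Same formula as _compute_gunning_fog_single above
    fog_index = 0.4 * (average_words_per_sentence + complex_word_percentage)
    grade_level = max(0, min(20, round(fog_index)))

    print(fog_index)    # 12.0 -> roughly a U.S. high-school-senior reading level
    print(grade_level)  # 12

In the packaged code, complex_word_percentage comes from process_text_for_complex_words in complex_words.py (which excludes proper nouns, hyphenated compounds, and common inflections), and compute_gunning_fog averages the per-chunk fog_index values via make_distribution.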