pystylometry 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/readability/gunning_fog.py
CHANGED
@@ -3,6 +3,12 @@
 This module computes the Gunning Fog Index, a readability metric that
 estimates the years of formal education needed to understand text on first reading.

+This implementation includes native chunked analysis for stylometric fingerprinting.
+
+Related GitHub Issues:
+    #4 - NLP-enhanced complex word detection
+    #27 - Native chunked analysis with Distribution dataclass
+
 Historical Background:
 ----------------------
 The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
@@ -12,221 +18,219 @@ a U.S. grade-level score (e.g., 12 = high school senior reading level).
 Reference:
     Gunning, R. (1952). The Technique of Clear Writing.
     McGraw-Hill, New York.
-
-Implementation Notes (PR #4):
-------------------------------
-This implementation addresses issues raised in GitHub PR #4:
-https://github.com/craigtrim/pystylometry/pull/4
-
-The original TODO implementation used simple syllable counting without proper
-exclusions for proper nouns, compounds, or inflections. This NLP-enhanced
-version uses the complex_words module for accurate detection via:
-
-1. spaCy POS tagging for proper noun detection (enhanced mode)
-2. spaCy lemmatization for morphological analysis (enhanced mode)
-3. Component-based analysis for hyphenated words (both modes)
-4. Graceful fallback to heuristics when spaCy unavailable (basic mode)
-
-See complex_words.py for detailed rationale and implementation.
 """

+import math
+
 from .._normalize import normalize_for_readability
-from .._types import GunningFogResult
+from .._types import Distribution, GunningFogResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
-
-# Import NLP-enhanced complex word detection module
-# This module addresses PR #4 issues with proper noun and inflection detection
 from .complex_words import process_text_for_complex_words

 # Formula coefficient from Gunning (1952)
-# Reference: Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
-# The 0.4 coefficient scales the combined complexity measure to approximate grade level
 _FOG_COEFFICIENT = 0.4


-def
+def _compute_gunning_fog_single(text: str, spacy_model: str) -> tuple[float, float, dict]:
+    """Compute Gunning Fog metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (fog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = normalize_for_readability(all_tokens)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+            },
+        )
+
+    # Count complex words using NLP-enhanced detection
+    complex_word_count, detection_metadata = process_text_for_complex_words(
+        text, tokens, model=spacy_model
+    )
+
+    # Calculate formula components
+    average_words_per_sentence = len(tokens) / len(sentences)
+    complex_word_percentage = (complex_word_count / len(tokens)) * 100
+
+    # Apply Gunning Fog formula
+    fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
+    grade_level = max(0, min(20, round(fog_index)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "complex_word_count": complex_word_count,
+        "complex_word_percentage": complex_word_percentage,
+        "average_words_per_sentence": average_words_per_sentence,
+        **detection_metadata,
+    }
+
+    return (fog_index, float(grade_level), metadata)
+
+
+def compute_gunning_fog(
+    text: str, chunk_size: int = 1000, spacy_model: str = "en_core_web_sm"
+) -> GunningFogResult:
     """
     Compute Gunning Fog Index with NLP-enhanced complex word detection.

-
-
-    lexical complexity (polysyllabic words) into a single grade-level score.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.

     Formula (Gunning, 1952):
     ------------------------
     Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]

-    Where:
-    - words/sentences = Average Sentence Length (ASL)
-    - complex words/words = Percentage of Hard Words (PHW)
-    - 0.4 = Scaling coefficient to approximate U.S. grade levels
-
-    The resulting score represents a U.S. education grade level:
-    - 6 = Sixth grade (age 11-12)
-    - 12 = High school senior (age 17-18)
-    - 17+ = College graduate level
-
-    Complex Words Definition (Gunning, 1952):
-    ------------------------------------------
-    Words with 3+ syllables, EXCLUDING:
+    Where complex words are words with 3+ syllables, EXCLUDING:
     1. Proper nouns (names, places, organizations)
     2. Compound words (hyphenated)
     3. Common verb forms (-es, -ed, -ing endings)

+    Related GitHub Issues:
+        #4 - NLP-enhanced complex word detection
+        #27 - Native chunked analysis with Distribution dataclass
+
     Reference:
         Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
-        Pages 38-39: Complex word criteria
-
-    NLP Enhancement (PR #4):
-    ------------------------
-    This implementation addresses issues in GitHub PR #4:
-    https://github.com/craigtrim/pystylometry/pull/4
-
-    **Enhanced Mode** (when spaCy available):
-    - Uses POS tagging (PROPN) for proper noun detection
-    - Uses lemmatization for morphological analysis
-    - Analyzes hyphenated word components individually
-    - More accurate, handles edge cases (acronyms, irregular verbs)
-
-    **Basic Mode** (when spaCy unavailable):
-    - Uses capitalization heuristic for proper nouns
-    - Uses simple suffix stripping for inflections
-    - Analyzes hyphenated word components individually
-    - Less accurate but requires no external dependencies
-
-    The mode used is reported in metadata for transparency.

     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
         spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
-            Requires model download: python -m spacy download en_core_web_sm
-            Other options: "en_core_web_md", "en_core_web_lg"

     Returns:
         GunningFogResult with:
-        - fog_index:
-        - grade_level:
-        -
-
-
-
-        - complex_word_percentage: Percentage of complex words
-        - average_words_per_sentence: Mean sentence length
-        - reliable: Boolean, True if word_count >= 100 and sentence_count >= 3
-        - mode: "enhanced" (spaCy) or "basic" (heuristics)
-        - proper_noun_detection: Detection method used
-        - inflection_handling: Inflection analysis method used
-        - spacy_model: Model name if enhanced mode (else absent)
+        - fog_index: Mean Fog Index across chunks
+        - grade_level: Mean grade level across chunks
+        - fog_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed

     Example:
-        >>>
-        >>> result
-
-
-
-        Grade Level: 3
-        >>> print(f"Mode: {result.metadata['mode']}")
-        Mode: enhanced
-
-        >>> # Complex academic text (high complexity)
-        >>> text = "Understanding phenomenological hermeneutics necessitates comprehensive study."
-        >>> result = compute_gunning_fog(text)
-        >>> print(f"Fog Index: {result.fog_index:.1f}")
-        Fog Index: 23.6
-        >>> print(f"Grade Level: {result.grade_level}")
-        Grade Level: 20
-
-        >>> # Check which detection mode was used
-        >>> if result.metadata['mode'] == 'enhanced':
-        ...     print("Using spaCy NLP features")
-        Using spaCy NLP features
-
-    Notes:
-        - Empty text returns fog_index=NaN and grade_level=NaN (no data)
-        - Grade levels are clamped to [0, 20] range for valid input
-        - For short texts (< 100 words), results may be unreliable
-        - Gunning (1952) recommends analyzing samples of 100+ words
+        >>> result = compute_gunning_fog("Long text here...", chunk_size=1000)
+        >>> result.fog_index  # Mean across chunks
+        12.5
+        >>> result.fog_index_dist.std  # Variance reveals fingerprint
+        2.1
     """
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    fog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_complex = 0
+    detection_metadata: dict = {}
+
+    for chunk in chunks:
+        fi, gl, meta = _compute_gunning_fog_single(chunk, spacy_model)
+        if not math.isnan(fi):
+            fog_values.append(fi)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_complex += meta.get("complex_word_count", 0)
+            # Capture detection metadata from first chunk (same for all chunks)
+            if not detection_metadata and "mode" in meta:
+                detection_metadata = {
+                    "mode": meta.get("mode"),
+                    "proper_noun_detection": meta.get("proper_noun_detection"),
+                    "inflection_handling": meta.get("inflection_handling"),
+                }
+                if "spacy_model" in meta:
+                    detection_metadata["spacy_model"] = meta.get("spacy_model")
+
+    # Handle empty or all-invalid chunks
+    if not fog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return GunningFogResult(
             fog_index=float("nan"),
             grade_level=float("nan"),
+            fog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
+                # Backward-compatible keys
                 "sentence_count": 0,
                 "word_count": 0,
                 "complex_word_count": 0,
                 "complex_word_percentage": 0.0,
                 "average_words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_complex_word_count": 0,
                 "reliable": False,
+                # Detection metadata
                 "mode": "none",
-                "proper_noun_detection": "
-                "inflection_handling": "
+                "proper_noun_detection": "none",
+                "inflection_handling": "none",
            },
        )

-    #
-
-
-    complex_word_count, detection_metadata = process_text_for_complex_words(
-        text, tokens, model=spacy_model
-    )
-
-    # Step 3: Calculate formula components
-    # Reference: Gunning (1952), p. 40: "The Fog Index formula"
+    # Build distributions
+    fog_dist = make_distribution(fog_values)
+    grade_dist = make_distribution(grade_values)

-    #
-
-    average_words_per_sentence = len(tokens) / len(sentences)
-
-    # Percentage of Hard Words (PHW)
-    # Number of complex words divided by total words, multiplied by 100
-    complex_word_percentage = (complex_word_count / len(tokens)) * 100
-
-    # Step 4: Apply Gunning Fog formula
-    # Fog = 0.4 × (ASL + PHW)
-    # The 0.4 coefficient scales the result to approximate U.S. grade levels
-    fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
-
-    # Step 5: Convert to grade level
-    # Round to nearest integer using standard rounding (round half to even)
-    # Clamp to reasonable range [0, 20] to prevent extreme values
-    # Note: Texts with fog_index > 20 are considered "post-graduate" level
-    grade_level = max(0, min(20, round(fog_index)))
+    # Reliability heuristic
+    reliable = total_words >= 100 and total_sentences >= 3

-    #
-
-
-
+    # Ensure detection metadata has defaults
+    if not detection_metadata:
+        detection_metadata = {
+            "mode": "none",
+            "proper_noun_detection": "none",
+            "inflection_handling": "none",
+        }

-    # Step 6: Assemble result with comprehensive metadata
     return GunningFogResult(
-        fog_index=
-        grade_level=
+        fog_index=fog_dist.mean,
+        grade_level=grade_dist.mean,
+        fog_index_dist=fog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            #
-            "sentence_count":
-            "word_count":
-            "complex_word_count":
-
-
-
-
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "complex_word_count": total_complex,
+            "complex_word_percentage": (total_complex / total_words * 100)
+            if total_words > 0
+            else 0,
+            "average_words_per_sentence": total_words / total_sentences
+            if total_sentences > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_complex_word_count": total_complex,
             "reliable": reliable,
-            # Detection
-            # This allows users to verify which mode was used
+            # Detection metadata
             **detection_metadata,
         },
     )
pystylometry/readability/smog.py
CHANGED
@@ -1,17 +1,62 @@
-"""SMOG (Simple Measure of Gobbledygook) Index.
+"""SMOG (Simple Measure of Gobbledygook) Index.
+
+This module implements the SMOG readability formula with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""

 import math

 from .._normalize import normalize_for_readability
-from .._types import SMOGResult
+from .._types import Distribution, SMOGResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables


-def
+def _compute_smog_single(text: str) -> tuple[float, float, dict]:
+    """Compute SMOG metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (smog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "polysyllable_count": 0},
+        )
+
+    # Count polysyllables (words with 3+ syllables)
+    polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
+
+    # SMOG formula
+    smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
+    grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "polysyllable_count": polysyllable_count,
+    }
+
+    return (smog_index, float(grade_level), metadata)
+
+
+def compute_smog(text: str, chunk_size: int = 1000) -> SMOGResult:
     """
     Compute SMOG (Simple Measure of Gobbledygook) Index.

+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Formula:
         SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291

@@ -20,69 +65,105 @@ def compute_smog(text: str) -> SMOGResult:
     The SMOG index estimates the years of education needed to understand the text.
     It's particularly useful for healthcare materials.

+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
         Journal of Reading, 12(8), 639-646.

     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.

     Returns:
-        SMOGResult with
-
-
-
-
-
-
+        SMOGResult with:
+        - smog_index: Mean SMOG index across chunks
+        - grade_level: Mean grade level across chunks
+        - smog_index_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed

     Example:
-        >>>
-        >>> result
-
-        >>>
+        >>> result = compute_smog("Long text here...", chunk_size=1000)
+        >>> result.smog_index  # Mean across chunks
+        12.5
+        >>> result.smog_index_dist.std  # Variance reveals fingerprint
+        1.8
     """
-
-
-
-    #
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    smog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_polysyllables = 0
+
+    for chunk in chunks:
+        si, gl, meta = _compute_smog_single(chunk)
+        if not math.isnan(si):
+            smog_values.append(si)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_polysyllables += meta.get("polysyllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not smog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SMOGResult(
             smog_index=float("nan"),
             grade_level=float("nan"),
+            smog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
             metadata={
+                # Backward-compatible keys
                 "sentence_count": 0,
                 "word_count": 0,
                 "polysyllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_polysyllable_count": 0,
                 "warning": "Insufficient text",
             },
         )

-    #
-
-
-    # SMOG formula: 1.043 × √(polysyllables × 30/sentences) + 3.1291
-    smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
-
-    # Use round-half-up rounding (not banker's rounding)
-    # Clamp to valid grade range [0, 20]
-    # Round half up: 4.5 → 5 (not Python's default round-half-to-even)
-    # math.floor(x + 0.5) implements round-half-up for both positive and negative values
-    # Lower bound: Prevent negative grades
-    # (though mathematically unlikely with SMOG's +3.1291 constant)
-    # Upper bound: Cap at grade 20 (post-graduate) for extreme complexity
-    grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+    # Build distributions
+    smog_dist = make_distribution(smog_values)
+    grade_dist = make_distribution(grade_values)

     return SMOGResult(
-        smog_index=
-        grade_level=
+        smog_index=smog_dist.mean,
+        grade_level=grade_dist.mean,
+        smog_index_dist=smog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "polysyllable_count": total_polysyllables,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_polysyllable_count": total_polysyllables,
+            "warning": "Less than 30 sentences" if total_sentences < 30 else None,
         },
     )