pystylometry 1.0.0-py3-none-any.whl → 1.1.0-py3-none-any.whl

Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
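Both readability diffs below import Distribution, chunk_text, and make_distribution from pystylometry/_types.py. That module's own diff (+963 -259) is not expanded here, so the following is only a minimal sketch of what those helpers plausibly look like, inferred from how they are called in gunning_fog.py and smog.py; the field names match the call sites, but the actual implementation may differ.

    # Hypothetical sketch of the _types.py helpers used by the diffs below.
    import statistics
    from dataclasses import dataclass


    @dataclass
    class Distribution:
        """Per-chunk metric values plus summary statistics (fields inferred from usage)."""
        values: list[float]
        mean: float
        median: float
        std: float
        range: float
        iqr: float


    def chunk_text(text: str, chunk_size: int) -> list[str]:
        """Split text into consecutive word-based chunks of roughly chunk_size words."""
        words = text.split()
        return [" ".join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]


    def make_distribution(values: list[float]) -> Distribution:
        """Summarize per-chunk values; the empty case mirrors the empty_dist built below."""
        if not values:
            return Distribution([], float("nan"), float("nan"), 0.0, 0.0, 0.0)
        quartiles = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
        return Distribution(
            values=list(values),
            mean=statistics.fmean(values),
            median=statistics.median(values),
            std=statistics.pstdev(values),
            range=max(values) - min(values),
            iqr=quartiles[2] - quartiles[0],
        )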
pystylometry/readability/gunning_fog.py

@@ -3,6 +3,12 @@
  This module computes the Gunning Fog Index, a readability metric that
  estimates the years of formal education needed to understand text on first reading.

+ This implementation includes native chunked analysis for stylometric fingerprinting.
+
+ Related GitHub Issues:
+ #4 - NLP-enhanced complex word detection
+ #27 - Native chunked analysis with Distribution dataclass
+
  Historical Background:
  ----------------------
  The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
@@ -12,221 +18,219 @@ a U.S. grade-level score (e.g., 12 = high school senior reading level).
  Reference:
  Gunning, R. (1952). The Technique of Clear Writing.
  McGraw-Hill, New York.
-
- Implementation Notes (PR #4):
- ------------------------------
- This implementation addresses issues raised in GitHub PR #4:
- https://github.com/craigtrim/pystylometry/pull/4
-
- The original TODO implementation used simple syllable counting without proper
- exclusions for proper nouns, compounds, or inflections. This NLP-enhanced
- version uses the complex_words module for accurate detection via:
-
- 1. spaCy POS tagging for proper noun detection (enhanced mode)
- 2. spaCy lemmatization for morphological analysis (enhanced mode)
- 3. Component-based analysis for hyphenated words (both modes)
- 4. Graceful fallback to heuristics when spaCy unavailable (basic mode)
-
- See complex_words.py for detailed rationale and implementation.
  """

+ import math
+
  from .._normalize import normalize_for_readability
- from .._types import GunningFogResult
+ from .._types import Distribution, GunningFogResult, chunk_text, make_distribution
  from .._utils import split_sentences, tokenize
-
- # Import NLP-enhanced complex word detection module
- # This module addresses PR #4 issues with proper noun and inflection detection
  from .complex_words import process_text_for_complex_words

  # Formula coefficient from Gunning (1952)
- # Reference: Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
- # The 0.4 coefficient scales the combined complexity measure to approximate grade level
  _FOG_COEFFICIENT = 0.4


- def compute_gunning_fog(text: str, spacy_model: str = "en_core_web_sm") -> GunningFogResult:
+ def _compute_gunning_fog_single(text: str, spacy_model: str) -> tuple[float, float, dict]:
+ """Compute Gunning Fog metrics for a single chunk of text.
+
+ Returns:
+ Tuple of (fog_index, grade_level, metadata_dict).
+ Returns (nan, nan, metadata) for empty/invalid input.
+ """
+ sentences = split_sentences(text)
+ all_tokens = tokenize(text)
+ tokens = normalize_for_readability(all_tokens)
+
+ if len(sentences) == 0 or len(tokens) == 0:
+ return (
+ float("nan"),
+ float("nan"),
+ {
+ "sentence_count": 0,
+ "word_count": 0,
+ "complex_word_count": 0,
+ "complex_word_percentage": 0.0,
+ },
+ )
+
+ # Count complex words using NLP-enhanced detection
+ complex_word_count, detection_metadata = process_text_for_complex_words(
+ text, tokens, model=spacy_model
+ )
+
+ # Calculate formula components
+ average_words_per_sentence = len(tokens) / len(sentences)
+ complex_word_percentage = (complex_word_count / len(tokens)) * 100
+
+ # Apply Gunning Fog formula
+ fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
+ grade_level = max(0, min(20, round(fog_index)))
+
+ metadata = {
+ "sentence_count": len(sentences),
+ "word_count": len(tokens),
+ "complex_word_count": complex_word_count,
+ "complex_word_percentage": complex_word_percentage,
+ "average_words_per_sentence": average_words_per_sentence,
+ **detection_metadata,
+ }
+
+ return (fog_index, float(grade_level), metadata)
+
+
+ def compute_gunning_fog(
+ text: str, chunk_size: int = 1000, spacy_model: str = "en_core_web_sm"
+ ) -> GunningFogResult:
  """
  Compute Gunning Fog Index with NLP-enhanced complex word detection.

- The Gunning Fog Index estimates the years of formal education required
- to understand text on first reading. It combines sentence length and
- lexical complexity (polysyllabic words) into a single grade-level score.
+ This function uses native chunked analysis to capture variance and patterns
+ across the text, which is essential for stylometric fingerprinting.

  Formula (Gunning, 1952):
  ------------------------
  Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]

- Where:
- - words/sentences = Average Sentence Length (ASL)
- - complex words/words = Percentage of Hard Words (PHW)
- - 0.4 = Scaling coefficient to approximate U.S. grade levels
-
- The resulting score represents a U.S. education grade level:
- - 6 = Sixth grade (age 11-12)
- - 12 = High school senior (age 17-18)
- - 17+ = College graduate level
-
- Complex Words Definition (Gunning, 1952):
- ------------------------------------------
- Words with 3+ syllables, EXCLUDING:
+ Where complex words are words with 3+ syllables, EXCLUDING:
  1. Proper nouns (names, places, organizations)
  2. Compound words (hyphenated)
  3. Common verb forms (-es, -ed, -ing endings)

+ Related GitHub Issues:
+ #4 - NLP-enhanced complex word detection
+ #27 - Native chunked analysis with Distribution dataclass
+
  Reference:
  Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
- Pages 38-39: Complex word criteria
-
- NLP Enhancement (PR #4):
- ------------------------
- This implementation addresses issues in GitHub PR #4:
- https://github.com/craigtrim/pystylometry/pull/4
-
- **Enhanced Mode** (when spaCy available):
- - Uses POS tagging (PROPN) for proper noun detection
- - Uses lemmatization for morphological analysis
- - Analyzes hyphenated word components individually
- - More accurate, handles edge cases (acronyms, irregular verbs)
-
- **Basic Mode** (when spaCy unavailable):
- - Uses capitalization heuristic for proper nouns
- - Uses simple suffix stripping for inflections
- - Analyzes hyphenated word components individually
- - Less accurate but requires no external dependencies
-
- The mode used is reported in metadata for transparency.

  Args:
  text: Input text to analyze
+ chunk_size: Number of words per chunk (default: 1000).
+ The text is divided into chunks of this size, and metrics are
+ computed per-chunk.
  spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
- Requires model download: python -m spacy download en_core_web_sm
- Other options: "en_core_web_md", "en_core_web_lg"

  Returns:
  GunningFogResult with:
- - fog_index: Float, the calculated Gunning Fog Index
- - grade_level: Float, rounded U.S. grade level (0-20), or NaN if empty
- - metadata: Dict with:
- - sentence_count: Number of sentences
- - word_count: Number of words (tokens)
- - complex_word_count: Number of complex words
- - complex_word_percentage: Percentage of complex words
- - average_words_per_sentence: Mean sentence length
- - reliable: Boolean, True if word_count >= 100 and sentence_count >= 3
- - mode: "enhanced" (spaCy) or "basic" (heuristics)
- - proper_noun_detection: Detection method used
- - inflection_handling: Inflection analysis method used
- - spacy_model: Model name if enhanced mode (else absent)
+ - fog_index: Mean Fog Index across chunks
+ - grade_level: Mean grade level across chunks
+ - fog_index_dist: Distribution with per-chunk values and stats
+ - grade_level_dist: Distribution with per-chunk values and stats
+ - chunk_size: The chunk size used
+ - chunk_count: Number of chunks analyzed

  Example:
- >>> # Simple text (low complexity)
- >>> result = compute_gunning_fog("The cat sat on the mat. The dog ran.")
- >>> print(f"Fog Index: {result.fog_index:.1f}")
- Fog Index: 2.7
- >>> print(f"Grade Level: {result.grade_level}")
- Grade Level: 3
- >>> print(f"Mode: {result.metadata['mode']}")
- Mode: enhanced
-
- >>> # Complex academic text (high complexity)
- >>> text = "Understanding phenomenological hermeneutics necessitates comprehensive study."
- >>> result = compute_gunning_fog(text)
- >>> print(f"Fog Index: {result.fog_index:.1f}")
- Fog Index: 23.6
- >>> print(f"Grade Level: {result.grade_level}")
- Grade Level: 20
-
- >>> # Check which detection mode was used
- >>> if result.metadata['mode'] == 'enhanced':
- ... print("Using spaCy NLP features")
- Using spaCy NLP features
-
- Notes:
- - Empty text returns fog_index=NaN and grade_level=NaN (no data)
- - Grade levels are clamped to [0, 20] range for valid input
- - For short texts (< 100 words), results may be unreliable
- - Gunning (1952) recommends analyzing samples of 100+ words
+ >>> result = compute_gunning_fog("Long text here...", chunk_size=1000)
+ >>> result.fog_index # Mean across chunks
+ 12.5
+ >>> result.fog_index_dist.std # Variance reveals fingerprint
+ 2.1
  """
- # Step 1: Sentence and word tokenization
- # Using the project's standard utilities for consistency
- sentences = split_sentences(text)
- all_tokens = tokenize(text)
-
- # Filter to only valid words (exclude punctuation, numbers, URLs, emails)
- # Allows hyphenated words and contractions per Gunning (1952)
- # Prevents errors in syllable counting from non-word tokens
- tokens = normalize_for_readability(all_tokens)
-
- # Edge case: Empty or whitespace-only input
- # Return NaN to distinguish "no data" from actual zero scores
- # This matches SMOG behavior and prevents conflating empty input with simple text
- if len(sentences) == 0 or len(tokens) == 0:
+ # Chunk the text
+ chunks = chunk_text(text, chunk_size)
+
+ # Compute metrics per chunk
+ fog_values = []
+ grade_values = []
+ total_sentences = 0
+ total_words = 0
+ total_complex = 0
+ detection_metadata: dict = {}
+
+ for chunk in chunks:
+ fi, gl, meta = _compute_gunning_fog_single(chunk, spacy_model)
+ if not math.isnan(fi):
+ fog_values.append(fi)
+ grade_values.append(gl)
+ total_sentences += meta.get("sentence_count", 0)
+ total_words += meta.get("word_count", 0)
+ total_complex += meta.get("complex_word_count", 0)
+ # Capture detection metadata from first chunk (same for all chunks)
+ if not detection_metadata and "mode" in meta:
+ detection_metadata = {
+ "mode": meta.get("mode"),
+ "proper_noun_detection": meta.get("proper_noun_detection"),
+ "inflection_handling": meta.get("inflection_handling"),
+ }
+ if "spacy_model" in meta:
+ detection_metadata["spacy_model"] = meta.get("spacy_model")
+
+ # Handle empty or all-invalid chunks
+ if not fog_values:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return GunningFogResult(
  fog_index=float("nan"),
  grade_level=float("nan"),
+ fog_index_dist=empty_dist,
+ grade_level_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
  metadata={
+ # Backward-compatible keys
  "sentence_count": 0,
  "word_count": 0,
  "complex_word_count": 0,
  "complex_word_percentage": 0.0,
  "average_words_per_sentence": 0.0,
+ # New prefixed keys for consistency
+ "total_sentence_count": 0,
+ "total_word_count": 0,
+ "total_complex_word_count": 0,
  "reliable": False,
+ # Detection metadata
  "mode": "none",
- "proper_noun_detection": "N/A",
- "inflection_handling": "N/A",
+ "proper_noun_detection": "none",
+ "inflection_handling": "none",
  },
  )

- # Step 2: Count complex words using NLP-enhanced detection
- # This addresses PR #4 issues with proper noun and inflection detection
- # See complex_words.py for detailed implementation
- complex_word_count, detection_metadata = process_text_for_complex_words(
- text, tokens, model=spacy_model
- )
-
- # Step 3: Calculate formula components
- # Reference: Gunning (1952), p. 40: "The Fog Index formula"
+ # Build distributions
+ fog_dist = make_distribution(fog_values)
+ grade_dist = make_distribution(grade_values)

- # Average Sentence Length (ASL)
- # Number of words divided by number of sentences
- average_words_per_sentence = len(tokens) / len(sentences)
-
- # Percentage of Hard Words (PHW)
- # Number of complex words divided by total words, multiplied by 100
- complex_word_percentage = (complex_word_count / len(tokens)) * 100
-
- # Step 4: Apply Gunning Fog formula
- # Fog = 0.4 × (ASL + PHW)
- # The 0.4 coefficient scales the result to approximate U.S. grade levels
- fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
-
- # Step 5: Convert to grade level
- # Round to nearest integer using standard rounding (round half to even)
- # Clamp to reasonable range [0, 20] to prevent extreme values
- # Note: Texts with fog_index > 20 are considered "post-graduate" level
- grade_level = max(0, min(20, round(fog_index)))
+ # Reliability heuristic
+ reliable = total_words >= 100 and total_sentences >= 3

- # Reliability heuristic: Gunning (1952) recommends 100+ word samples
- # Also require 3+ sentences to ensure meaningful average sentence length
- # Very long texts with few sentences can produce unstable FOG estimates
- reliable = len(tokens) >= 100 and len(sentences) >= 3
+ # Ensure detection metadata has defaults
+ if not detection_metadata:
+ detection_metadata = {
+ "mode": "none",
+ "proper_noun_detection": "none",
+ "inflection_handling": "none",
+ }

- # Step 6: Assemble result with comprehensive metadata
  return GunningFogResult(
- fog_index=fog_index,
- grade_level=grade_level,
+ fog_index=fog_dist.mean,
+ grade_level=grade_dist.mean,
+ fog_index_dist=fog_dist,
+ grade_level_dist=grade_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
  metadata={
- # Core counts
- "sentence_count": len(sentences),
- "word_count": len(tokens),
- "complex_word_count": complex_word_count,
- # Derived metrics
- "complex_word_percentage": complex_word_percentage,
- "average_words_per_sentence": average_words_per_sentence,
- # Reliability indicator
+ # Backward-compatible keys
+ "sentence_count": total_sentences,
+ "word_count": total_words,
+ "complex_word_count": total_complex,
+ "complex_word_percentage": (total_complex / total_words * 100)
+ if total_words > 0
+ else 0,
+ "average_words_per_sentence": total_words / total_sentences
+ if total_sentences > 0
+ else 0,
+ # New prefixed keys for consistency
+ "total_sentence_count": total_sentences,
+ "total_word_count": total_words,
+ "total_complex_word_count": total_complex,
  "reliable": reliable,
- # Detection method transparency (from complex_words module)
- # This allows users to verify which mode was used
+ # Detection metadata
  **detection_metadata,
  },
  )
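The new compute_gunning_fog signature above aggregates per-chunk scores into a mean plus a Distribution. A minimal usage sketch, assuming the module path from the file list and the attribute names shown in the diff; the file name and printed values are illustrative, not package output:

    from pystylometry.readability.gunning_fog import compute_gunning_fog

    with open("essay.txt", encoding="utf-8") as fh:
        result = compute_gunning_fog(fh.read(), chunk_size=1000)

    print(result.fog_index)             # mean Fog Index across chunks
    print(result.grade_level)           # mean clamped grade level
    print(result.fog_index_dist.std)    # chunk-to-chunk spread (the "fingerprint" signal)
    print(result.metadata["reliable"])  # False below 100 words or 3 sentences

    # Hand check of the formula for one chunk: with an average sentence length of
    # 15 words and 12% complex words, Fog = 0.4 * (15 + 12) = 10.8, i.e. grade level 11.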
pystylometry/readability/smog.py

@@ -1,17 +1,62 @@
- """SMOG (Simple Measure of Gobbledygook) Index."""
+ """SMOG (Simple Measure of Gobbledygook) Index.
+
+ This module implements the SMOG readability formula with native chunked
+ analysis for stylometric fingerprinting.
+
+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+ """

  import math

  from .._normalize import normalize_for_readability
- from .._types import SMOGResult
+ from .._types import Distribution, SMOGResult, chunk_text, make_distribution
  from .._utils import split_sentences, tokenize
  from .syllables import count_syllables


- def compute_smog(text: str) -> SMOGResult:
+ def _compute_smog_single(text: str) -> tuple[float, float, dict]:
+ """Compute SMOG metrics for a single chunk of text.
+
+ Returns:
+ Tuple of (smog_index, grade_level, metadata_dict).
+ Returns (nan, nan, metadata) for empty/invalid input.
+ """
+ sentences = split_sentences(text)
+ tokens = tokenize(text)
+ word_tokens = normalize_for_readability(tokens)
+
+ if len(sentences) == 0 or len(word_tokens) == 0:
+ return (
+ float("nan"),
+ float("nan"),
+ {"sentence_count": 0, "word_count": 0, "polysyllable_count": 0},
+ )
+
+ # Count polysyllables (words with 3+ syllables)
+ polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
+
+ # SMOG formula
+ smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
+ grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+
+ metadata = {
+ "sentence_count": len(sentences),
+ "word_count": len(word_tokens),
+ "polysyllable_count": polysyllable_count,
+ }
+
+ return (smog_index, float(grade_level), metadata)
+
+
+ def compute_smog(text: str, chunk_size: int = 1000) -> SMOGResult:
  """
  Compute SMOG (Simple Measure of Gobbledygook) Index.

+ This function uses native chunked analysis to capture variance and patterns
+ across the text, which is essential for stylometric fingerprinting.
+
  Formula:
  SMOG = 1.043 × √(polysyllables × 30/sentences) + 3.1291

@@ -20,69 +65,105 @@ def compute_smog(text: str) -> SMOGResult:
  The SMOG index estimates the years of education needed to understand the text.
  It's particularly useful for healthcare materials.

+ Related GitHub Issue:
+ #27 - Native chunked analysis with Distribution dataclass
+ https://github.com/craigtrim/pystylometry/issues/27
+
  References:
  McLaughlin, G. H. (1969). SMOG grading: A new readability formula.
  Journal of Reading, 12(8), 639-646.

  Args:
  text: Input text to analyze
+ chunk_size: Number of words per chunk (default: 1000).
+ The text is divided into chunks of this size, and metrics are
+ computed per-chunk.

  Returns:
- SMOGResult with SMOG index and grade level
-
- Note: For empty input (no sentences or words), smog_index and grade_level
- will be float('nan'). This prevents conflating "no data" with actual scores.
-
- SMOG is designed for texts with 30+ sentences. For shorter texts, the formula
- still computes but a warning is included in metadata. Results may be less reliable.
+ SMOGResult with:
+ - smog_index: Mean SMOG index across chunks
+ - grade_level: Mean grade level across chunks
+ - smog_index_dist: Distribution with per-chunk values and stats
+ - grade_level_dist: Distribution with per-chunk values and stats
+ - chunk_size: The chunk size used
+ - chunk_count: Number of chunks analyzed

  Example:
- >>> text = "Caffeinated programmers debugged incomprehensible code."
- >>> result = compute_smog(text)
- >>> print(f"SMOG Index: {result.smog_index:.1f}")
- >>> print(f"Grade Level: {result.grade_level}")
+ >>> result = compute_smog("Long text here...", chunk_size=1000)
+ >>> result.smog_index # Mean across chunks
+ 12.5
+ >>> result.smog_index_dist.std # Variance reveals fingerprint
+ 1.8
  """
- sentences = split_sentences(text)
- tokens = tokenize(text)
-
- # Filter tokens to only valid words for syllable counting
- # Removes numbers, URLs, emails, etc. that would cause errors
- word_tokens = normalize_for_readability(tokens)
-
- if len(sentences) == 0 or len(word_tokens) == 0:
+ # Chunk the text
+ chunks = chunk_text(text, chunk_size)
+
+ # Compute metrics per chunk
+ smog_values = []
+ grade_values = []
+ total_sentences = 0
+ total_words = 0
+ total_polysyllables = 0
+
+ for chunk in chunks:
+ si, gl, meta = _compute_smog_single(chunk)
+ if not math.isnan(si):
+ smog_values.append(si)
+ grade_values.append(gl)
+ total_sentences += meta.get("sentence_count", 0)
+ total_words += meta.get("word_count", 0)
+ total_polysyllables += meta.get("polysyllable_count", 0)
+
+ # Handle empty or all-invalid chunks
+ if not smog_values:
+ empty_dist = Distribution(
+ values=[],
+ mean=float("nan"),
+ median=float("nan"),
+ std=0.0,
+ range=0.0,
+ iqr=0.0,
+ )
  return SMOGResult(
  smog_index=float("nan"),
  grade_level=float("nan"),
+ smog_index_dist=empty_dist,
+ grade_level_dist=empty_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
  metadata={
+ # Backward-compatible keys
  "sentence_count": 0,
  "word_count": 0,
  "polysyllable_count": 0,
+ # New prefixed keys for consistency
+ "total_sentence_count": 0,
+ "total_word_count": 0,
+ "total_polysyllable_count": 0,
  "warning": "Insufficient text",
  },
  )

- # Count polysyllables (words with 3+ syllables) - safe now, only valid words
- polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
-
- # SMOG formula: 1.043 × √(polysyllables × 30/sentences) + 3.1291
- smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
-
- # Use round-half-up rounding (not banker's rounding)
- # Clamp to valid grade range [0, 20]
- # Round half up: 4.5 → 5 (not Python's default round-half-to-even)
- # math.floor(x + 0.5) implements round-half-up for both positive and negative values
- # Lower bound: Prevent negative grades
- # (though mathematically unlikely with SMOG's +3.1291 constant)
- # Upper bound: Cap at grade 20 (post-graduate) for extreme complexity
- grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
+ # Build distributions
+ smog_dist = make_distribution(smog_values)
+ grade_dist = make_distribution(grade_values)

  return SMOGResult(
- smog_index=smog_index,
- grade_level=grade_level,
+ smog_index=smog_dist.mean,
+ grade_level=grade_dist.mean,
+ smog_index_dist=smog_dist,
+ grade_level_dist=grade_dist,
+ chunk_size=chunk_size,
+ chunk_count=len(chunks),
  metadata={
- "sentence_count": len(sentences),
- "word_count": len(word_tokens),
- "polysyllable_count": polysyllable_count,
- "warning": "Less than 30 sentences" if len(sentences) < 30 else None,
+ # Backward-compatible keys
+ "sentence_count": total_sentences,
+ "word_count": total_words,
+ "polysyllable_count": total_polysyllables,
+ # New prefixed keys for consistency
+ "total_sentence_count": total_sentences,
+ "total_word_count": total_words,
+ "total_polysyllable_count": total_polysyllables,
+ "warning": "Less than 30 sentences" if total_sentences < 30 else None,
  },
  )
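For reference, the SMOG arithmetic used in _compute_smog_single above works out as follows on round numbers; this is a worked check of the formula, not output from the package:

    import math

    # A 30-sentence sample containing 25 polysyllabic (3+ syllable) words:
    polysyllables, sentences = 25, 30
    smog_index = 1.043 * math.sqrt(polysyllables * 30 / sentences) + 3.1291
    grade_level = max(0, min(20, math.floor(smog_index + 0.5)))  # round half up, clamp to [0, 20]
    print(round(smog_index, 2), grade_level)  # 8.34 8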