pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +1 -2
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1224 -2
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +4 -0
- pystylometry/authorship/additional_methods.py +100 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +301 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +641 -0
- pystylometry/lexical/function_words.py +391 -0
- pystylometry/lexical/hapax.py +154 -7
- pystylometry/lexical/mtld.py +83 -7
- pystylometry/lexical/ttr.py +83 -0
- pystylometry/lexical/word_frequency_sophistication.py +581 -0
- pystylometry/lexical/yule.py +34 -7
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +985 -0
- pystylometry/readability/ari.py +93 -17
- pystylometry/readability/coleman_liau.py +102 -9
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +59 -14
- pystylometry/readability/gunning_fog.py +194 -25
- pystylometry/readability/smog.py +31 -14
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +432 -0
- pystylometry/syntactic/pos_ratios.py +104 -13
- pystylometry/syntactic/sentence_stats.py +57 -13
- pystylometry/syntactic/sentence_types.py +470 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
- pystylometry-1.0.0.dist-info/RECORD +46 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
- pystylometry-0.1.0.dist-info/RECORD +0 -26
|
@@ -1,63 +1,232 @@
|
|
|
1
|
-
"""Gunning Fog Index.
|
|
1
|
+
"""Gunning Fog Index with NLP-enhanced complex word detection.
|
|
2
2
|
|
|
3
|
+
This module computes the Gunning Fog Index, a readability metric that
|
|
4
|
+
estimates the years of formal education needed to understand text on first reading.
|
|
5
|
+
|
|
6
|
+
Historical Background:
|
|
7
|
+
----------------------
|
|
8
|
+
The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
|
|
9
|
+
work helping businesses improve the clarity of their writing. The formula produces
|
|
10
|
+
a U.S. grade-level score (e.g., 12 = high school senior reading level).
|
|
11
|
+
|
|
12
|
+
Reference:
|
|
13
|
+
Gunning, R. (1952). The Technique of Clear Writing.
|
|
14
|
+
McGraw-Hill, New York.
|
|
15
|
+
|
|
16
|
+
Implementation Notes (PR #4):
|
|
17
|
+
------------------------------
|
|
18
|
+
This implementation addresses issues raised in GitHub PR #4:
|
|
19
|
+
https://github.com/craigtrim/pystylometry/pull/4
|
|
20
|
+
|
|
21
|
+
The original TODO implementation used simple syllable counting without proper
|
|
22
|
+
exclusions for proper nouns, compounds, or inflections. This NLP-enhanced
|
|
23
|
+
version uses the complex_words module for accurate detection via:
|
|
24
|
+
|
|
25
|
+
1. spaCy POS tagging for proper noun detection (enhanced mode)
|
|
26
|
+
2. spaCy lemmatization for morphological analysis (enhanced mode)
|
|
27
|
+
3. Component-based analysis for hyphenated words (both modes)
|
|
28
|
+
4. Graceful fallback to heuristics when spaCy unavailable (basic mode)
|
|
29
|
+
|
|
30
|
+
See complex_words.py for detailed rationale and implementation.
|
|
31
|
+
"""
|
|
32
|
+
|
|
33
|
+
from .._normalize import normalize_for_readability
|
|
3
34
|
from .._types import GunningFogResult
|
|
4
35
|
from .._utils import split_sentences, tokenize
|
|
5
|
-
from .syllables import count_syllables
|
|
6
36
|
|
|
37
|
+
# Import NLP-enhanced complex word detection module
|
|
38
|
+
# This module addresses PR #4 issues with proper noun and inflection detection
|
|
39
|
+
from .complex_words import process_text_for_complex_words
|
|
7
40
|
|
|
8
|
-
|
|
41
|
+
# Formula coefficient from Gunning (1952)
|
|
42
|
+
# Reference: Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
|
|
43
|
+
# The 0.4 coefficient scales the combined complexity measure to approximate grade level
|
|
44
|
+
_FOG_COEFFICIENT = 0.4
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def compute_gunning_fog(text: str, spacy_model: str = "en_core_web_sm") -> GunningFogResult:
|
|
9
48
|
"""
|
|
10
|
-
Compute Gunning Fog Index.
|
|
49
|
+
Compute Gunning Fog Index with NLP-enhanced complex word detection.
|
|
11
50
|
|
|
12
|
-
|
|
51
|
+
The Gunning Fog Index estimates the years of formal education required
|
|
52
|
+
to understand text on first reading. It combines sentence length and
|
|
53
|
+
lexical complexity (polysyllabic words) into a single grade-level score.
|
|
54
|
+
|
|
55
|
+
Formula (Gunning, 1952):
|
|
56
|
+
------------------------
|
|
13
57
|
Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
|
|
14
58
|
|
|
15
|
-
Where
|
|
16
|
-
|
|
59
|
+
Where:
|
|
60
|
+
- words/sentences = Average Sentence Length (ASL)
|
|
61
|
+
- complex words/words = Percentage of Hard Words (PHW)
|
|
62
|
+
- 0.4 = Scaling coefficient to approximate U.S. grade levels
|
|
63
|
+
|
|
64
|
+
The resulting score represents a U.S. education grade level:
|
|
65
|
+
- 6 = Sixth grade (age 11-12)
|
|
66
|
+
- 12 = High school senior (age 17-18)
|
|
67
|
+
- 17+ = College graduate level
|
|
68
|
+
|
|
69
|
+
Complex Words Definition (Gunning, 1952):
|
|
70
|
+
------------------------------------------
|
|
71
|
+
Words with 3+ syllables, EXCLUDING:
|
|
72
|
+
1. Proper nouns (names, places, organizations)
|
|
73
|
+
2. Compound words (hyphenated)
|
|
74
|
+
3. Common verb forms (-es, -ed, -ing endings)
|
|
75
|
+
|
|
76
|
+
Reference:
|
|
77
|
+
Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
|
|
78
|
+
Pages 38-39: Complex word criteria
|
|
79
|
+
|
|
80
|
+
NLP Enhancement (PR #4):
|
|
81
|
+
------------------------
|
|
82
|
+
This implementation addresses issues in GitHub PR #4:
|
|
83
|
+
https://github.com/craigtrim/pystylometry/pull/4
|
|
17
84
|
|
|
18
|
-
|
|
19
|
-
|
|
85
|
+
**Enhanced Mode** (when spaCy available):
|
|
86
|
+
- Uses POS tagging (PROPN) for proper noun detection
|
|
87
|
+
- Uses lemmatization for morphological analysis
|
|
88
|
+
- Analyzes hyphenated word components individually
|
|
89
|
+
- More accurate, handles edge cases (acronyms, irregular verbs)
|
|
20
90
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
91
|
+
**Basic Mode** (when spaCy unavailable):
|
|
92
|
+
- Uses capitalization heuristic for proper nouns
|
|
93
|
+
- Uses simple suffix stripping for inflections
|
|
94
|
+
- Analyzes hyphenated word components individually
|
|
95
|
+
- Less accurate but requires no external dependencies
|
|
96
|
+
|
|
97
|
+
The mode used is reported in metadata for transparency.
|
|
24
98
|
|
|
25
99
|
Args:
|
|
26
100
|
text: Input text to analyze
|
|
101
|
+
spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
|
|
102
|
+
Requires model download: python -m spacy download en_core_web_sm
|
|
103
|
+
Other options: "en_core_web_md", "en_core_web_lg"
|
|
27
104
|
|
|
28
105
|
Returns:
|
|
29
|
-
GunningFogResult with
|
|
106
|
+
GunningFogResult with:
|
|
107
|
+
- fog_index: Float, the calculated Gunning Fog Index
|
|
108
|
+
- grade_level: Float, rounded U.S. grade level (0-20), or NaN if empty
|
|
109
|
+
- metadata: Dict with:
|
|
110
|
+
- sentence_count: Number of sentences
|
|
111
|
+
- word_count: Number of words (tokens)
|
|
112
|
+
- complex_word_count: Number of complex words
|
|
113
|
+
- complex_word_percentage: Percentage of complex words
|
|
114
|
+
- average_words_per_sentence: Mean sentence length
|
|
115
|
+
- reliable: Boolean, True if word_count >= 100 and sentence_count >= 3
|
|
116
|
+
- mode: "enhanced" (spaCy) or "basic" (heuristics)
|
|
117
|
+
- proper_noun_detection: Detection method used
|
|
118
|
+
- inflection_handling: Inflection analysis method used
|
|
119
|
+
- spacy_model: Model name if enhanced mode (else absent)
|
|
30
120
|
|
|
31
121
|
Example:
|
|
32
|
-
>>>
|
|
122
|
+
>>> # Simple text (low complexity)
|
|
123
|
+
>>> result = compute_gunning_fog("The cat sat on the mat. The dog ran.")
|
|
124
|
+
>>> print(f"Fog Index: {result.fog_index:.1f}")
|
|
125
|
+
Fog Index: 2.7
|
|
126
|
+
>>> print(f"Grade Level: {result.grade_level}")
|
|
127
|
+
Grade Level: 3
|
|
128
|
+
>>> print(f"Mode: {result.metadata['mode']}")
|
|
129
|
+
Mode: enhanced
|
|
130
|
+
|
|
131
|
+
>>> # Complex academic text (high complexity)
|
|
132
|
+
>>> text = "Understanding phenomenological hermeneutics necessitates comprehensive study."
|
|
133
|
+
>>> result = compute_gunning_fog(text)
|
|
33
134
|
>>> print(f"Fog Index: {result.fog_index:.1f}")
|
|
135
|
+
Fog Index: 23.6
|
|
34
136
|
>>> print(f"Grade Level: {result.grade_level}")
|
|
137
|
+
Grade Level: 20
|
|
138
|
+
|
|
139
|
+
>>> # Check which detection mode was used
|
|
140
|
+
>>> if result.metadata['mode'] == 'enhanced':
|
|
141
|
+
... print("Using spaCy NLP features")
|
|
142
|
+
Using spaCy NLP features
|
|
143
|
+
|
|
144
|
+
Notes:
|
|
145
|
+
- Empty text returns fog_index=NaN and grade_level=NaN (no data)
|
|
146
|
+
- Grade levels are clamped to [0, 20] range for valid input
|
|
147
|
+
- For short texts (< 100 words), results may be unreliable
|
|
148
|
+
- Gunning (1952) recommends analyzing samples of 100+ words
|
|
35
149
|
"""
|
|
150
|
+
# Step 1: Sentence and word tokenization
|
|
151
|
+
# Using the project's standard utilities for consistency
|
|
36
152
|
sentences = split_sentences(text)
|
|
37
|
-
|
|
153
|
+
all_tokens = tokenize(text)
|
|
154
|
+
|
|
155
|
+
# Filter to only valid words (exclude punctuation, numbers, URLs, emails)
|
|
156
|
+
# Allows hyphenated words and contractions per Gunning (1952)
|
|
157
|
+
# Prevents errors in syllable counting from non-word tokens
|
|
158
|
+
tokens = normalize_for_readability(all_tokens)
|
|
38
159
|
|
|
160
|
+
# Edge case: Empty or whitespace-only input
|
|
161
|
+
# Return NaN to distinguish "no data" from actual zero scores
|
|
162
|
+
# This matches SMOG behavior and prevents conflating empty input with simple text
|
|
39
163
|
if len(sentences) == 0 or len(tokens) == 0:
|
|
40
164
|
return GunningFogResult(
|
|
41
|
-
fog_index=
|
|
42
|
-
grade_level=
|
|
43
|
-
metadata={
|
|
165
|
+
fog_index=float("nan"),
|
|
166
|
+
grade_level=float("nan"),
|
|
167
|
+
metadata={
|
|
168
|
+
"sentence_count": 0,
|
|
169
|
+
"word_count": 0,
|
|
170
|
+
"complex_word_count": 0,
|
|
171
|
+
"complex_word_percentage": 0.0,
|
|
172
|
+
"average_words_per_sentence": 0.0,
|
|
173
|
+
"reliable": False,
|
|
174
|
+
"mode": "none",
|
|
175
|
+
"proper_noun_detection": "N/A",
|
|
176
|
+
"inflection_handling": "N/A",
|
|
177
|
+
},
|
|
44
178
|
)
|
|
45
179
|
|
|
46
|
-
# Count complex words
|
|
47
|
-
#
|
|
48
|
-
|
|
180
|
+
# Step 2: Count complex words using NLP-enhanced detection
|
|
181
|
+
# This addresses PR #4 issues with proper noun and inflection detection
|
|
182
|
+
# See complex_words.py for detailed implementation
|
|
183
|
+
complex_word_count, detection_metadata = process_text_for_complex_words(
|
|
184
|
+
text, tokens, model=spacy_model
|
|
185
|
+
)
|
|
186
|
+
|
|
187
|
+
# Step 3: Calculate formula components
|
|
188
|
+
# Reference: Gunning (1952), p. 40: "The Fog Index formula"
|
|
189
|
+
|
|
190
|
+
# Average Sentence Length (ASL)
|
|
191
|
+
# Number of words divided by number of sentences
|
|
192
|
+
average_words_per_sentence = len(tokens) / len(sentences)
|
|
193
|
+
|
|
194
|
+
# Percentage of Hard Words (PHW)
|
|
195
|
+
# Number of complex words divided by total words, multiplied by 100
|
|
196
|
+
complex_word_percentage = (complex_word_count / len(tokens)) * 100
|
|
197
|
+
|
|
198
|
+
# Step 4: Apply Gunning Fog formula
|
|
199
|
+
# Fog = 0.4 × (ASL + PHW)
|
|
200
|
+
# The 0.4 coefficient scales the result to approximate U.S. grade levels
|
|
201
|
+
fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
|
|
202
|
+
|
|
203
|
+
# Step 5: Convert to grade level
|
|
204
|
+
# Round to nearest integer using standard rounding (round half to even)
|
|
205
|
+
# Clamp to reasonable range [0, 20] to prevent extreme values
|
|
206
|
+
# Note: Texts with fog_index > 20 are considered "post-graduate" level
|
|
207
|
+
grade_level = max(0, min(20, round(fog_index)))
|
|
49
208
|
|
|
50
|
-
#
|
|
51
|
-
|
|
52
|
-
|
|
209
|
+
# Reliability heuristic: Gunning (1952) recommends 100+ word samples
|
|
210
|
+
# Also require 3+ sentences to ensure meaningful average sentence length
|
|
211
|
+
# Very long texts with few sentences can produce unstable FOG estimates
|
|
212
|
+
reliable = len(tokens) >= 100 and len(sentences) >= 3
|
|
53
213
|
|
|
214
|
+
# Step 6: Assemble result with comprehensive metadata
|
|
54
215
|
return GunningFogResult(
|
|
55
216
|
fog_index=fog_index,
|
|
56
217
|
grade_level=grade_level,
|
|
57
218
|
metadata={
|
|
219
|
+
# Core counts
|
|
58
220
|
"sentence_count": len(sentences),
|
|
59
221
|
"word_count": len(tokens),
|
|
60
222
|
"complex_word_count": complex_word_count,
|
|
61
|
-
|
|
223
|
+
# Derived metrics
|
|
224
|
+
"complex_word_percentage": complex_word_percentage,
|
|
225
|
+
"average_words_per_sentence": average_words_per_sentence,
|
|
226
|
+
# Reliability indicator
|
|
227
|
+
"reliable": reliable,
|
|
228
|
+
# Detection method transparency (from complex_words module)
|
|
229
|
+
# This allows users to verify which mode was used
|
|
230
|
+
**detection_metadata,
|
|
62
231
|
},
|
|
63
232
|
)
|
pystylometry/readability/smog.py
CHANGED
|
@@ -1,5 +1,8 @@
|
|
|
1
1
|
"""SMOG (Simple Measure of Gobbledygook) Index."""
|
|
2
2
|
|
|
3
|
+
import math
|
|
4
|
+
|
|
5
|
+
from .._normalize import normalize_for_readability
|
|
3
6
|
from .._types import SMOGResult
|
|
4
7
|
from .._utils import split_sentences, tokenize
|
|
5
8
|
from .syllables import count_syllables
|
|
@@ -27,23 +30,29 @@ def compute_smog(text: str) -> SMOGResult:
|
|
|
27
30
|
Returns:
|
|
28
31
|
SMOGResult with SMOG index and grade level
|
|
29
32
|
|
|
33
|
+
Note: For empty input (no sentences or words), smog_index and grade_level
|
|
34
|
+
will be float('nan'). This prevents conflating "no data" with actual scores.
|
|
35
|
+
|
|
36
|
+
SMOG is designed for texts with 30+ sentences. For shorter texts, the formula
|
|
37
|
+
still computes but a warning is included in metadata. Results may be less reliable.
|
|
38
|
+
|
|
30
39
|
Example:
|
|
31
|
-
>>>
|
|
40
|
+
>>> text = "Caffeinated programmers debugged incomprehensible code."
|
|
41
|
+
>>> result = compute_smog(text)
|
|
32
42
|
>>> print(f"SMOG Index: {result.smog_index:.1f}")
|
|
33
43
|
>>> print(f"Grade Level: {result.grade_level}")
|
|
34
44
|
"""
|
|
35
45
|
sentences = split_sentences(text)
|
|
36
46
|
tokens = tokenize(text)
|
|
37
47
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
pass
|
|
48
|
+
# Filter tokens to only valid words for syllable counting
|
|
49
|
+
# Removes numbers, URLs, emails, etc. that would cause errors
|
|
50
|
+
word_tokens = normalize_for_readability(tokens)
|
|
42
51
|
|
|
43
|
-
if len(sentences) == 0 or len(
|
|
52
|
+
if len(sentences) == 0 or len(word_tokens) == 0:
|
|
44
53
|
return SMOGResult(
|
|
45
|
-
smog_index=
|
|
46
|
-
grade_level=
|
|
54
|
+
smog_index=float("nan"),
|
|
55
|
+
grade_level=float("nan"),
|
|
47
56
|
metadata={
|
|
48
57
|
"sentence_count": 0,
|
|
49
58
|
"word_count": 0,
|
|
@@ -52,19 +61,27 @@ def compute_smog(text: str) -> SMOGResult:
|
|
|
52
61
|
},
|
|
53
62
|
)
|
|
54
63
|
|
|
55
|
-
# Count polysyllables (words with 3+ syllables)
|
|
56
|
-
polysyllable_count = sum(1 for word in
|
|
64
|
+
# Count polysyllables (words with 3+ syllables) - safe now, only valid words
|
|
65
|
+
polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
|
|
66
|
+
|
|
67
|
+
# SMOG formula: 1.043 × √(polysyllables × 30/sentences) + 3.1291
|
|
68
|
+
smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
|
|
57
69
|
|
|
58
|
-
#
|
|
59
|
-
|
|
60
|
-
|
|
70
|
+
# Use round-half-up rounding (not banker's rounding)
|
|
71
|
+
# Clamp to valid grade range [0, 20]
|
|
72
|
+
# Round half up: 4.5 → 5 (not Python's default round-half-to-even)
|
|
73
|
+
# math.floor(x + 0.5) implements round-half-up for both positive and negative values
|
|
74
|
+
# Lower bound: Prevent negative grades
|
|
75
|
+
# (though mathematically unlikely with SMOG's +3.1291 constant)
|
|
76
|
+
# Upper bound: Cap at grade 20 (post-graduate) for extreme complexity
|
|
77
|
+
grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
|
|
61
78
|
|
|
62
79
|
return SMOGResult(
|
|
63
80
|
smog_index=smog_index,
|
|
64
81
|
grade_level=grade_level,
|
|
65
82
|
metadata={
|
|
66
83
|
"sentence_count": len(sentences),
|
|
67
|
-
"word_count": len(
|
|
84
|
+
"word_count": len(word_tokens),
|
|
68
85
|
"polysyllable_count": polysyllable_count,
|
|
69
86
|
"warning": "Less than 30 sentences" if len(sentences) < 30 else None,
|
|
70
87
|
},
|
|
@@ -1,54 +1,161 @@
|
|
|
1
|
-
"""
|
|
1
|
+
"""
|
|
2
|
+
Syllable counting using CMU Pronouncing Dictionary.
|
|
2
3
|
|
|
4
|
+
Uses the pronouncing library which provides access to the CMU Pronouncing
|
|
5
|
+
Dictionary for high-accuracy syllable counting based on phonetic transcriptions.
|
|
6
|
+
"""
|
|
3
7
|
|
|
8
|
+
import re
|
|
9
|
+
from functools import lru_cache
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import pronouncing # type: ignore[import-untyped]
|
|
13
|
+
except ImportError:
|
|
14
|
+
raise ImportError(
|
|
15
|
+
"The 'pronouncing' library is required for syllable counting. "
|
|
16
|
+
"Install it with: pip install pystylometry[readability]"
|
|
17
|
+
)
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
@lru_cache(maxsize=4096)
|
|
4
21
|
def count_syllables(word: str) -> int:
|
|
5
22
|
"""
|
|
6
|
-
Count syllables
|
|
23
|
+
Count syllables using CMU Pronouncing Dictionary.
|
|
24
|
+
|
|
25
|
+
Uses phonetic transcriptions from CMU dictionary. For words with multiple
|
|
26
|
+
pronunciations, uses the first pronunciation (typically the most common).
|
|
27
|
+
Falls back to simple vowel counting for words not in the dictionary.
|
|
7
28
|
|
|
8
29
|
Args:
|
|
9
|
-
word:
|
|
30
|
+
word: Input word (handles mixed case, strips whitespace)
|
|
10
31
|
|
|
11
32
|
Returns:
|
|
12
|
-
|
|
33
|
+
Syllable count (minimum 1 for non-empty input)
|
|
34
|
+
|
|
35
|
+
Example:
|
|
36
|
+
>>> count_syllables("beautiful")
|
|
37
|
+
3
|
|
38
|
+
>>> count_syllables("fire")
|
|
39
|
+
2
|
|
40
|
+
>>> count_syllables("cruel")
|
|
41
|
+
1
|
|
13
42
|
"""
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
43
|
+
word = word.lower().strip()
|
|
44
|
+
if not word:
|
|
45
|
+
return 0
|
|
17
46
|
|
|
47
|
+
# Strip common punctuation
|
|
48
|
+
word = word.strip(".,;:!?\"'()-")
|
|
49
|
+
if not word:
|
|
50
|
+
return 0
|
|
18
51
|
|
|
19
|
-
|
|
20
|
-
""
|
|
21
|
-
|
|
52
|
+
# Handle contractions by removing apostrophes
|
|
53
|
+
if "'" in word:
|
|
54
|
+
word = word.replace("'", "")
|
|
22
55
|
|
|
23
|
-
|
|
24
|
-
|
|
56
|
+
# Handle hyphenated compounds
|
|
57
|
+
if "-" in word:
|
|
58
|
+
return sum(count_syllables(part) for part in word.split("-") if part)
|
|
25
59
|
|
|
26
|
-
|
|
27
|
-
|
|
60
|
+
# Get pronunciations from CMU dictionary
|
|
61
|
+
phones_list = pronouncing.phones_for_word(word)
|
|
28
62
|
|
|
29
|
-
|
|
30
|
-
|
|
63
|
+
if phones_list:
|
|
64
|
+
# Use first pronunciation (most common)
|
|
65
|
+
# Count stress markers (0, 1, 2) in phoneme representation
|
|
66
|
+
phones = phones_list[0]
|
|
67
|
+
return pronouncing.syllable_count(phones) # type: ignore[no-any-return]
|
|
68
|
+
|
|
69
|
+
# Fallback for words not in dictionary: simple vowel counting
|
|
70
|
+
return _fallback_count(word)
|
|
71
|
+
|
|
72
|
+
|
|
73
|
+
def _fallback_count(word: str) -> int:
|
|
31
74
|
"""
|
|
32
|
-
|
|
33
|
-
if len(word) == 0:
|
|
34
|
-
return 0
|
|
75
|
+
Simple fallback syllable counter for words not in CMU dictionary.
|
|
35
76
|
|
|
77
|
+
Uses basic vowel counting with silent-e adjustment.
|
|
78
|
+
Less accurate than CMU but handles rare/technical words.
|
|
79
|
+
"""
|
|
36
80
|
vowels = "aeiouy"
|
|
37
|
-
|
|
38
|
-
|
|
81
|
+
count = 0
|
|
82
|
+
prev_was_vowel = False
|
|
39
83
|
|
|
40
84
|
for char in word:
|
|
41
85
|
is_vowel = char in vowels
|
|
42
|
-
if is_vowel and not
|
|
43
|
-
|
|
44
|
-
|
|
86
|
+
if is_vowel and not prev_was_vowel:
|
|
87
|
+
count += 1
|
|
88
|
+
prev_was_vowel = is_vowel
|
|
45
89
|
|
|
46
90
|
# Adjust for silent 'e'
|
|
47
|
-
if word.endswith("e") and
|
|
48
|
-
|
|
91
|
+
if word.endswith("e") and count > 1:
|
|
92
|
+
count -= 1
|
|
93
|
+
|
|
94
|
+
# Ensure minimum of 1
|
|
95
|
+
return max(1, count)
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def count_syllables_text(text: str) -> list[tuple[str, int]]:
|
|
99
|
+
"""
|
|
100
|
+
Count syllables for all words in a text.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
text: Input text
|
|
104
|
+
|
|
105
|
+
Returns:
|
|
106
|
+
List of (word, syllable_count) tuples
|
|
107
|
+
|
|
108
|
+
Example:
|
|
109
|
+
>>> count_syllables_text("The quick brown fox")
|
|
110
|
+
[('The', 1), ('quick', 1), ('brown', 1), ('fox', 1)]
|
|
111
|
+
"""
|
|
112
|
+
|
|
113
|
+
words = re.findall(r"[a-zA-Z']+", text)
|
|
114
|
+
return [(w, count_syllables(w)) for w in words]
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
def total_syllables(text: str) -> int:
|
|
118
|
+
"""
|
|
119
|
+
Return total syllable count for text.
|
|
120
|
+
|
|
121
|
+
Args:
|
|
122
|
+
text: Input text
|
|
123
|
+
|
|
124
|
+
Returns:
|
|
125
|
+
Total number of syllables
|
|
126
|
+
|
|
127
|
+
Example:
|
|
128
|
+
>>> total_syllables("The quick brown fox")
|
|
129
|
+
4
|
|
130
|
+
"""
|
|
131
|
+
return sum(count for _, count in count_syllables_text(text))
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def validate_accuracy(
|
|
135
|
+
test_pairs: list[tuple[str, int]],
|
|
136
|
+
) -> tuple[float, list[tuple[str, int, int]]]:
|
|
137
|
+
"""
|
|
138
|
+
Test accuracy against known word-syllable pairs.
|
|
139
|
+
|
|
140
|
+
Args:
|
|
141
|
+
test_pairs: List of (word, expected_syllables) tuples
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
(accuracy_percentage, list of (word, expected, got) for failures)
|
|
145
|
+
|
|
146
|
+
Example:
|
|
147
|
+
>>> test_pairs = [("hello", 2), ("world", 1), ("beautiful", 3)]
|
|
148
|
+
>>> accuracy, failures = validate_accuracy(test_pairs)
|
|
149
|
+
>>> print(f"Accuracy: {accuracy:.1f}%")
|
|
150
|
+
"""
|
|
151
|
+
failures = []
|
|
152
|
+
for word, expected in test_pairs:
|
|
153
|
+
got = count_syllables(word)
|
|
154
|
+
if got != expected:
|
|
155
|
+
failures.append((word, expected, got))
|
|
49
156
|
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
syllable_count = 1
|
|
157
|
+
if not test_pairs:
|
|
158
|
+
return 0.0, []
|
|
53
159
|
|
|
54
|
-
|
|
160
|
+
accuracy = (len(test_pairs) - len(failures)) / len(test_pairs) * 100
|
|
161
|
+
return accuracy, failures
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
"""Stylistic analysis metrics.
|
|
2
|
+
|
|
3
|
+
Related GitHub Issues:
|
|
4
|
+
#20 - Stylistic Markers
|
|
5
|
+
#21 - Vocabulary Overlap and Similarity Metrics
|
|
6
|
+
#22 - Cohesion and Coherence Metrics
|
|
7
|
+
#23 - Genre and Register Features
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from .cohesion_coherence import compute_cohesion_coherence
|
|
11
|
+
from .genre_register import compute_genre_register
|
|
12
|
+
from .markers import compute_stylistic_markers
|
|
13
|
+
from .vocabulary_overlap import compute_vocabulary_overlap
|
|
14
|
+
|
|
15
|
+
__all__ = [
|
|
16
|
+
"compute_stylistic_markers",
|
|
17
|
+
"compute_vocabulary_overlap",
|
|
18
|
+
"compute_cohesion_coherence",
|
|
19
|
+
"compute_genre_register",
|
|
20
|
+
]
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Cohesion and coherence metrics.
|
|
2
|
+
|
|
3
|
+
This module measures how well a text holds together structurally (cohesion)
|
|
4
|
+
and semantically (coherence). Important for analyzing writing quality and
|
|
5
|
+
authorial sophistication.
|
|
6
|
+
|
|
7
|
+
Related GitHub Issue:
|
|
8
|
+
#22 - Cohesion and Coherence Metrics
|
|
9
|
+
https://github.com/craigtrim/pystylometry/issues/22
|
|
10
|
+
|
|
11
|
+
References:
|
|
12
|
+
Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
|
|
13
|
+
Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from .._types import CohesionCoherenceResult
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
|
|
20
|
+
"""
|
|
21
|
+
Compute cohesion and coherence metrics.
|
|
22
|
+
|
|
23
|
+
Related GitHub Issue:
|
|
24
|
+
#22 - Cohesion and Coherence Metrics
|
|
25
|
+
https://github.com/craigtrim/pystylometry/issues/22
|
|
26
|
+
|
|
27
|
+
Args:
|
|
28
|
+
text: Input text to analyze
|
|
29
|
+
model: spaCy model for linguistic analysis
|
|
30
|
+
|
|
31
|
+
Returns:
|
|
32
|
+
CohesionCoherenceResult with referential cohesion, lexical cohesion,
|
|
33
|
+
connective density, and coherence scores.
|
|
34
|
+
|
|
35
|
+
Example:
|
|
36
|
+
>>> result = compute_cohesion_coherence("Multi-paragraph text...")
|
|
37
|
+
>>> print(f"Pronoun density: {result.pronoun_density:.2f}")
|
|
38
|
+
>>> print(f"Connective density: {result.connective_density:.2f}")
|
|
39
|
+
"""
|
|
40
|
+
# TODO: Implement cohesion/coherence analysis
|
|
41
|
+
# GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22
|
|
42
|
+
raise NotImplementedError(
|
|
43
|
+
"Cohesion/coherence metrics not yet implemented. "
|
|
44
|
+
"See GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22"
|
|
45
|
+
)
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Genre and register classification features.
|
|
2
|
+
|
|
3
|
+
This module extracts features that distinguish between different text types
|
|
4
|
+
(academic, journalistic, fiction, legal, etc.) and formality levels.
|
|
5
|
+
|
|
6
|
+
Related GitHub Issue:
|
|
7
|
+
#23 - Genre and Register Features
|
|
8
|
+
https://github.com/craigtrim/pystylometry/issues/23
|
|
9
|
+
|
|
10
|
+
References:
|
|
11
|
+
Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
|
|
12
|
+
Biber, D., & Conrad, S. (2009). Register, genre, and style.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from .._types import GenreRegisterResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def compute_genre_register(text: str, model: str = "en_core_web_sm") -> GenreRegisterResult:
|
|
19
|
+
"""
|
|
20
|
+
Analyze genre and register features for text classification.
|
|
21
|
+
|
|
22
|
+
Related GitHub Issue:
|
|
23
|
+
#23 - Genre and Register Features
|
|
24
|
+
https://github.com/craigtrim/pystylometry/issues/23
|
|
25
|
+
|
|
26
|
+
Args:
|
|
27
|
+
text: Input text to analyze
|
|
28
|
+
model: spaCy model for linguistic analysis
|
|
29
|
+
|
|
30
|
+
Returns:
|
|
31
|
+
GenreRegisterResult with formality scores, register classification,
|
|
32
|
+
genre predictions, and feature scores for major genres.
|
|
33
|
+
|
|
34
|
+
Example:
|
|
35
|
+
>>> result = compute_genre_register("Academic paper text...")
|
|
36
|
+
>>> print(f"Formality score: {result.formality_score:.2f}")
|
|
37
|
+
>>> print(f"Predicted genre: {result.predicted_genre}")
|
|
38
|
+
>>> print(f"Academic score: {result.academic_score:.3f}")
|
|
39
|
+
"""
|
|
40
|
+
# TODO: Implement genre/register analysis
|
|
41
|
+
# GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23
|
|
42
|
+
raise NotImplementedError(
|
|
43
|
+
"Genre/register classification not yet implemented. "
|
|
44
|
+
"See GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23"
|
|
45
|
+
)
|