pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
@@ -1,63 +1,232 @@
1
- """Gunning Fog Index."""
1
+ """Gunning Fog Index with NLP-enhanced complex word detection.
2
2
 
3
+ This module computes the Gunning Fog Index, a readability metric that
4
+ estimates the years of formal education needed to understand text on first reading.
5
+
6
+ Historical Background:
7
+ ----------------------
8
+ The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
9
+ work helping businesses improve the clarity of their writing. The formula produces
10
+ a U.S. grade-level score (e.g., 12 = high school senior reading level).
11
+
12
+ Reference:
13
+ Gunning, R. (1952). The Technique of Clear Writing.
14
+ McGraw-Hill, New York.
15
+
16
+ Implementation Notes (PR #4):
17
+ ------------------------------
18
+ This implementation addresses issues raised in GitHub PR #4:
19
+ https://github.com/craigtrim/pystylometry/pull/4
20
+
21
+ The original TODO implementation used simple syllable counting without proper
22
+ exclusions for proper nouns, compounds, or inflections. This NLP-enhanced
23
+ version uses the complex_words module for accurate detection via:
24
+
25
+ 1. spaCy POS tagging for proper noun detection (enhanced mode)
26
+ 2. spaCy lemmatization for morphological analysis (enhanced mode)
27
+ 3. Component-based analysis for hyphenated words (both modes)
28
+ 4. Graceful fallback to heuristics when spaCy unavailable (basic mode)
29
+
30
+ See complex_words.py for detailed rationale and implementation.
31
+ """
32
+
33
+ from .._normalize import normalize_for_readability
3
34
  from .._types import GunningFogResult
4
35
  from .._utils import split_sentences, tokenize
5
- from .syllables import count_syllables
6
36
 
37
+ # Import NLP-enhanced complex word detection module
38
+ # This module addresses PR #4 issues with proper noun and inflection detection
39
+ from .complex_words import process_text_for_complex_words
7
40
 
8
- def compute_gunning_fog(text: str) -> GunningFogResult:
41
+ # Formula coefficient from Gunning (1952)
42
+ # Reference: Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
43
+ # The 0.4 coefficient scales the combined complexity measure to approximate grade level
44
+ _FOG_COEFFICIENT = 0.4
45
+
46
+
47
+ def compute_gunning_fog(text: str, spacy_model: str = "en_core_web_sm") -> GunningFogResult:
9
48
  """
10
- Compute Gunning Fog Index.
49
+ Compute Gunning Fog Index with NLP-enhanced complex word detection.
11
50
 
12
- Formula:
51
+ The Gunning Fog Index estimates the years of formal education required
52
+ to understand text on first reading. It combines sentence length and
53
+ lexical complexity (polysyllabic words) into a single grade-level score.
54
+
55
+ Formula (Gunning, 1952):
56
+ ------------------------
13
57
  Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
14
58
 
15
- Where complex words are defined as words with 3+ syllables,
16
- excluding proper nouns, compound words, and common suffixes.
59
+ Where:
60
+ - words/sentences = Average Sentence Length (ASL)
61
+ - complex words/words = Percentage of Hard Words (PHW)
62
+ - 0.4 = Scaling coefficient to approximate U.S. grade levels
63
+
64
+ The resulting score represents a U.S. education grade level:
65
+ - 6 = Sixth grade (age 11-12)
66
+ - 12 = High school senior (age 17-18)
67
+ - 17+ = College graduate level
68
+
69
+ Complex Words Definition (Gunning, 1952):
70
+ ------------------------------------------
71
+ Words with 3+ syllables, EXCLUDING:
72
+ 1. Proper nouns (names, places, organizations)
73
+ 2. Compound words (hyphenated)
74
+ 3. Common verb forms (-es, -ed, -ing endings)
75
+
76
+ Reference:
77
+ Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
78
+ Pages 38-39: Complex word criteria
79
+
80
+ NLP Enhancement (PR #4):
81
+ ------------------------
82
+ This implementation addresses issues in GitHub PR #4:
83
+ https://github.com/craigtrim/pystylometry/pull/4
17
84
 
18
- The index estimates years of formal education needed to understand the text
19
- on first reading.
85
+ **Enhanced Mode** (when spaCy available):
86
+ - Uses POS tagging (PROPN) for proper noun detection
87
+ - Uses lemmatization for morphological analysis
88
+ - Analyzes hyphenated word components individually
89
+ - More accurate, handles edge cases (acronyms, irregular verbs)
20
90
 
21
- References:
22
- Gunning, R. (1952). The Technique of Clear Writing.
23
- McGraw-Hill.
91
+ **Basic Mode** (when spaCy unavailable):
92
+ - Uses capitalization heuristic for proper nouns
93
+ - Uses simple suffix stripping for inflections
94
+ - Analyzes hyphenated word components individually
95
+ - Less accurate but requires no external dependencies
96
+
97
+ The mode used is reported in metadata for transparency.
24
98
 
25
99
  Args:
26
100
  text: Input text to analyze
101
+ spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
102
+ Requires model download: python -m spacy download en_core_web_sm
103
+ Other options: "en_core_web_md", "en_core_web_lg"
27
104
 
28
105
  Returns:
29
- GunningFogResult with fog index and grade level
106
+ GunningFogResult with:
107
+ - fog_index: Float, the calculated Gunning Fog Index
108
+ - grade_level: Float, rounded U.S. grade level (0-20), or NaN if empty
109
+ - metadata: Dict with:
110
+ - sentence_count: Number of sentences
111
+ - word_count: Number of words (tokens)
112
+ - complex_word_count: Number of complex words
113
+ - complex_word_percentage: Percentage of complex words
114
+ - average_words_per_sentence: Mean sentence length
115
+ - reliable: Boolean, True if word_count >= 100 and sentence_count >= 3
116
+ - mode: "enhanced" (spaCy) or "basic" (heuristics)
117
+ - proper_noun_detection: Detection method used
118
+ - inflection_handling: Inflection analysis method used
119
+ - spacy_model: Model name if enhanced mode (else absent)
30
120
 
31
121
  Example:
32
- >>> result = compute_gunning_fog("The quick brown fox jumps over the lazy dog.")
122
+ >>> # Simple text (low complexity)
123
+ >>> result = compute_gunning_fog("The cat sat on the mat. The dog ran.")
124
+ >>> print(f"Fog Index: {result.fog_index:.1f}")
125
+ Fog Index: 2.7
126
+ >>> print(f"Grade Level: {result.grade_level}")
127
+ Grade Level: 3
128
+ >>> print(f"Mode: {result.metadata['mode']}")
129
+ Mode: enhanced
130
+
131
+ >>> # Complex academic text (high complexity)
132
+ >>> text = "Understanding phenomenological hermeneutics necessitates comprehensive study."
133
+ >>> result = compute_gunning_fog(text)
33
134
  >>> print(f"Fog Index: {result.fog_index:.1f}")
135
+ Fog Index: 23.6
34
136
  >>> print(f"Grade Level: {result.grade_level}")
137
+ Grade Level: 20
138
+
139
+ >>> # Check which detection mode was used
140
+ >>> if result.metadata['mode'] == 'enhanced':
141
+ ... print("Using spaCy NLP features")
142
+ Using spaCy NLP features
143
+
144
+ Notes:
145
+ - Empty text returns fog_index=NaN and grade_level=NaN (no data)
146
+ - Grade levels are clamped to [0, 20] range for valid input
147
+ - For short texts (< 100 words), results may be unreliable
148
+ - Gunning (1952) recommends analyzing samples of 100+ words
35
149
  """
150
+ # Step 1: Sentence and word tokenization
151
+ # Using the project's standard utilities for consistency
36
152
  sentences = split_sentences(text)
37
- tokens = tokenize(text)
153
+ all_tokens = tokenize(text)
154
+
155
+ # Filter to only valid words (exclude punctuation, numbers, URLs, emails)
156
+ # Allows hyphenated words and contractions per Gunning (1952)
157
+ # Prevents errors in syllable counting from non-word tokens
158
+ tokens = normalize_for_readability(all_tokens)
38
159
 
160
+ # Edge case: Empty or whitespace-only input
161
+ # Return NaN to distinguish "no data" from actual zero scores
162
+ # This matches SMOG behavior and prevents conflating empty input with simple text
39
163
  if len(sentences) == 0 or len(tokens) == 0:
40
164
  return GunningFogResult(
41
- fog_index=0.0,
42
- grade_level=0,
43
- metadata={"sentence_count": 0, "word_count": 0, "complex_word_count": 0},
165
+ fog_index=float("nan"),
166
+ grade_level=float("nan"),
167
+ metadata={
168
+ "sentence_count": 0,
169
+ "word_count": 0,
170
+ "complex_word_count": 0,
171
+ "complex_word_percentage": 0.0,
172
+ "average_words_per_sentence": 0.0,
173
+ "reliable": False,
174
+ "mode": "none",
175
+ "proper_noun_detection": "N/A",
176
+ "inflection_handling": "N/A",
177
+ },
44
178
  )
45
179
 
46
- # Count complex words (3+ syllables)
47
- # TODO: Exclude proper nouns, compound words, and -es/-ed/-ing endings
48
- complex_word_count = sum(1 for word in tokens if count_syllables(word) >= 3)
180
+ # Step 2: Count complex words using NLP-enhanced detection
181
+ # This addresses PR #4 issues with proper noun and inflection detection
182
+ # See complex_words.py for detailed implementation
183
+ complex_word_count, detection_metadata = process_text_for_complex_words(
184
+ text, tokens, model=spacy_model
185
+ )
186
+
187
+ # Step 3: Calculate formula components
188
+ # Reference: Gunning (1952), p. 40: "The Fog Index formula"
189
+
190
+ # Average Sentence Length (ASL)
191
+ # Number of words divided by number of sentences
192
+ average_words_per_sentence = len(tokens) / len(sentences)
193
+
194
+ # Percentage of Hard Words (PHW)
195
+ # Number of complex words divided by total words, multiplied by 100
196
+ complex_word_percentage = (complex_word_count / len(tokens)) * 100
197
+
198
+ # Step 4: Apply Gunning Fog formula
199
+ # Fog = 0.4 × (ASL + PHW)
200
+ # The 0.4 coefficient scales the result to approximate U.S. grade levels
201
+ fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
202
+
203
+ # Step 5: Convert to grade level
204
+ # Round to nearest integer using standard rounding (round half to even)
205
+ # Clamp to reasonable range [0, 20] to prevent extreme values
206
+ # Note: Texts with fog_index > 20 are considered "post-graduate" level
207
+ grade_level = max(0, min(20, round(fog_index)))
49
208
 
50
- # TODO: Implement Gunning Fog formula
51
- fog_index = 0.0 # Placeholder
52
- grade_level = 0 # Placeholder
209
+ # Reliability heuristic: Gunning (1952) recommends 100+ word samples
210
+ # Also require 3+ sentences to ensure meaningful average sentence length
211
+ # Very long texts with few sentences can produce unstable FOG estimates
212
+ reliable = len(tokens) >= 100 and len(sentences) >= 3
53
213
 
214
+ # Step 6: Assemble result with comprehensive metadata
54
215
  return GunningFogResult(
55
216
  fog_index=fog_index,
56
217
  grade_level=grade_level,
57
218
  metadata={
219
+ # Core counts
58
220
  "sentence_count": len(sentences),
59
221
  "word_count": len(tokens),
60
222
  "complex_word_count": complex_word_count,
61
- "complex_word_percentage": (complex_word_count / len(tokens) * 100) if tokens else 0,
223
+ # Derived metrics
224
+ "complex_word_percentage": complex_word_percentage,
225
+ "average_words_per_sentence": average_words_per_sentence,
226
+ # Reliability indicator
227
+ "reliable": reliable,
228
+ # Detection method transparency (from complex_words module)
229
+ # This allows users to verify which mode was used
230
+ **detection_metadata,
62
231
  },
63
232
  )
@@ -1,5 +1,8 @@
1
1
  """SMOG (Simple Measure of Gobbledygook) Index."""
2
2
 
3
+ import math
4
+
5
+ from .._normalize import normalize_for_readability
3
6
  from .._types import SMOGResult
4
7
  from .._utils import split_sentences, tokenize
5
8
  from .syllables import count_syllables
@@ -27,23 +30,29 @@ def compute_smog(text: str) -> SMOGResult:
27
30
  Returns:
28
31
  SMOGResult with SMOG index and grade level
29
32
 
33
+ Note: For empty input (no sentences or words), smog_index and grade_level
34
+ will be float('nan'). This prevents conflating "no data" with actual scores.
35
+
36
+ SMOG is designed for texts with 30+ sentences. For shorter texts, the formula
37
+ still computes but a warning is included in metadata. Results may be less reliable.
38
+
30
39
  Example:
31
- >>> result = compute_smog("The quick brown fox jumps over the lazy dog.")
40
+ >>> text = "Caffeinated programmers debugged incomprehensible code."
41
+ >>> result = compute_smog(text)
32
42
  >>> print(f"SMOG Index: {result.smog_index:.1f}")
33
43
  >>> print(f"Grade Level: {result.grade_level}")
34
44
  """
35
45
  sentences = split_sentences(text)
36
46
  tokens = tokenize(text)
37
47
 
38
- if len(sentences) < 30:
39
- # SMOG requires at least 30 sentences for accuracy
40
- # We'll compute anyway but note in metadata
41
- pass
48
+ # Filter tokens to only valid words for syllable counting
49
+ # Removes numbers, URLs, emails, etc. that would cause errors
50
+ word_tokens = normalize_for_readability(tokens)
42
51
 
43
- if len(sentences) == 0 or len(tokens) == 0:
52
+ if len(sentences) == 0 or len(word_tokens) == 0:
44
53
  return SMOGResult(
45
- smog_index=0.0,
46
- grade_level=0,
54
+ smog_index=float("nan"),
55
+ grade_level=float("nan"),
47
56
  metadata={
48
57
  "sentence_count": 0,
49
58
  "word_count": 0,
@@ -52,19 +61,27 @@ def compute_smog(text: str) -> SMOGResult:
52
61
  },
53
62
  )
54
63
 
55
- # Count polysyllables (words with 3+ syllables)
56
- polysyllable_count = sum(1 for word in tokens if count_syllables(word) >= 3)
64
+ # Count polysyllables (words with 3+ syllables) - safe now, only valid words
65
+ polysyllable_count = sum(1 for word in word_tokens if count_syllables(word) >= 3)
66
+
67
+ # SMOG formula: 1.043 × √(polysyllables × 30/sentences) + 3.1291
68
+ smog_index = 1.043 * math.sqrt(polysyllable_count * 30 / len(sentences)) + 3.1291
57
69
 
58
- # TODO: Implement SMOG formula
59
- smog_index = 0.0 # Placeholder
60
- grade_level = 0 # Placeholder
70
+ # Use round-half-up rounding (not banker's rounding)
71
+ # Clamp to valid grade range [0, 20]
72
+ # Round half up: 4.5 → 5 (not Python's default round-half-to-even)
73
+ # math.floor(x + 0.5) implements round-half-up for both positive and negative values
74
+ # Lower bound: Prevent negative grades
75
+ # (though mathematically unlikely with SMOG's +3.1291 constant)
76
+ # Upper bound: Cap at grade 20 (post-graduate) for extreme complexity
77
+ grade_level = max(0, min(20, math.floor(smog_index + 0.5)))
61
78
 
62
79
  return SMOGResult(
63
80
  smog_index=smog_index,
64
81
  grade_level=grade_level,
65
82
  metadata={
66
83
  "sentence_count": len(sentences),
67
- "word_count": len(tokens),
84
+ "word_count": len(word_tokens),
68
85
  "polysyllable_count": polysyllable_count,
69
86
  "warning": "Less than 30 sentences" if len(sentences) < 30 else None,
70
87
  },
@@ -1,54 +1,161 @@
1
- """Syllable counting utilities using CMU Pronouncing Dictionary."""
1
+ """
2
+ Syllable counting using CMU Pronouncing Dictionary.
2
3
 
4
+ Uses the pronouncing library which provides access to the CMU Pronouncing
5
+ Dictionary for high-accuracy syllable counting based on phonetic transcriptions.
6
+ """
3
7
 
8
+ import re
9
+ from functools import lru_cache
10
+
11
+ try:
12
+ import pronouncing # type: ignore[import-untyped]
13
+ except ImportError:
14
+ raise ImportError(
15
+ "The 'pronouncing' library is required for syllable counting. "
16
+ "Install it with: pip install pystylometry[readability]"
17
+ )
18
+
19
+
20
+ @lru_cache(maxsize=4096)
4
21
  def count_syllables(word: str) -> int:
5
22
  """
6
- Count syllables in a word using CMU Pronouncing Dictionary with heuristic fallback.
23
+ Count syllables using CMU Pronouncing Dictionary.
24
+
25
+ Uses phonetic transcriptions from CMU dictionary. For words with multiple
26
+ pronunciations, uses the first pronunciation (typically the most common).
27
+ Falls back to simple vowel counting for words not in the dictionary.
7
28
 
8
29
  Args:
9
- word: The word to count syllables for
30
+ word: Input word (handles mixed case, strips whitespace)
10
31
 
11
32
  Returns:
12
- Number of syllables in the word
33
+ Syllable count (minimum 1 for non-empty input)
34
+
35
+ Example:
36
+ >>> count_syllables("beautiful")
37
+ 3
38
+ >>> count_syllables("fire")
39
+ 2
40
+ >>> count_syllables("cruel")
41
+ 1
13
42
  """
14
- # TODO: Implement with pronouncing library
15
- # For now, use simple heuristic fallback
16
- return _heuristic_syllable_count(word)
43
+ word = word.lower().strip()
44
+ if not word:
45
+ return 0
17
46
 
47
+ # Strip common punctuation
48
+ word = word.strip(".,;:!?\"'()-")
49
+ if not word:
50
+ return 0
18
51
 
19
- def _heuristic_syllable_count(word: str) -> int:
20
- """
21
- Simple heuristic syllable counter for fallback.
52
+ # Handle contractions by removing apostrophes
53
+ if "'" in word:
54
+ word = word.replace("'", "")
22
55
 
23
- This is a basic implementation that counts vowel groups.
24
- Should be replaced with CMU dict lookup when pronouncing is available.
56
+ # Handle hyphenated compounds
57
+ if "-" in word:
58
+ return sum(count_syllables(part) for part in word.split("-") if part)
25
59
 
26
- Args:
27
- word: The word to count syllables for
60
+ # Get pronunciations from CMU dictionary
61
+ phones_list = pronouncing.phones_for_word(word)
28
62
 
29
- Returns:
30
- Estimated number of syllables
63
+ if phones_list:
64
+ # Use first pronunciation (most common)
65
+ # Count stress markers (0, 1, 2) in phoneme representation
66
+ phones = phones_list[0]
67
+ return pronouncing.syllable_count(phones) # type: ignore[no-any-return]
68
+
69
+ # Fallback for words not in dictionary: simple vowel counting
70
+ return _fallback_count(word)
71
+
72
+
73
+ def _fallback_count(word: str) -> int:
31
74
  """
32
- word = word.lower().strip()
33
- if len(word) == 0:
34
- return 0
75
+ Simple fallback syllable counter for words not in CMU dictionary.
35
76
 
77
+ Uses basic vowel counting with silent-e adjustment.
78
+ Less accurate than CMU but handles rare/technical words.
79
+ """
36
80
  vowels = "aeiouy"
37
- syllable_count = 0
38
- previous_was_vowel = False
81
+ count = 0
82
+ prev_was_vowel = False
39
83
 
40
84
  for char in word:
41
85
  is_vowel = char in vowels
42
- if is_vowel and not previous_was_vowel:
43
- syllable_count += 1
44
- previous_was_vowel = is_vowel
86
+ if is_vowel and not prev_was_vowel:
87
+ count += 1
88
+ prev_was_vowel = is_vowel
45
89
 
46
90
  # Adjust for silent 'e'
47
- if word.endswith("e") and syllable_count > 1:
48
- syllable_count -= 1
91
+ if word.endswith("e") and count > 1:
92
+ count -= 1
93
+
94
+ # Ensure minimum of 1
95
+ return max(1, count)
96
+
97
+
98
+ def count_syllables_text(text: str) -> list[tuple[str, int]]:
99
+ """
100
+ Count syllables for all words in a text.
101
+
102
+ Args:
103
+ text: Input text
104
+
105
+ Returns:
106
+ List of (word, syllable_count) tuples
107
+
108
+ Example:
109
+ >>> count_syllables_text("The quick brown fox")
110
+ [('The', 1), ('quick', 1), ('brown', 1), ('fox', 1)]
111
+ """
112
+
113
+ words = re.findall(r"[a-zA-Z']+", text)
114
+ return [(w, count_syllables(w)) for w in words]
115
+
116
+
117
+ def total_syllables(text: str) -> int:
118
+ """
119
+ Return total syllable count for text.
120
+
121
+ Args:
122
+ text: Input text
123
+
124
+ Returns:
125
+ Total number of syllables
126
+
127
+ Example:
128
+ >>> total_syllables("The quick brown fox")
129
+ 4
130
+ """
131
+ return sum(count for _, count in count_syllables_text(text))
132
+
133
+
134
+ def validate_accuracy(
135
+ test_pairs: list[tuple[str, int]],
136
+ ) -> tuple[float, list[tuple[str, int, int]]]:
137
+ """
138
+ Test accuracy against known word-syllable pairs.
139
+
140
+ Args:
141
+ test_pairs: List of (word, expected_syllables) tuples
142
+
143
+ Returns:
144
+ (accuracy_percentage, list of (word, expected, got) for failures)
145
+
146
+ Example:
147
+ >>> test_pairs = [("hello", 2), ("world", 1), ("beautiful", 3)]
148
+ >>> accuracy, failures = validate_accuracy(test_pairs)
149
+ >>> print(f"Accuracy: {accuracy:.1f}%")
150
+ """
151
+ failures = []
152
+ for word, expected in test_pairs:
153
+ got = count_syllables(word)
154
+ if got != expected:
155
+ failures.append((word, expected, got))
49
156
 
50
- # Ensure at least one syllable
51
- if syllable_count == 0:
52
- syllable_count = 1
157
+ if not test_pairs:
158
+ return 0.0, []
53
159
 
54
- return syllable_count
160
+ accuracy = (len(test_pairs) - len(failures)) / len(test_pairs) * 100
161
+ return accuracy, failures
@@ -0,0 +1,20 @@
1
+ """Stylistic analysis metrics.
2
+
3
+ Related GitHub Issues:
4
+ #20 - Stylistic Markers
5
+ #21 - Vocabulary Overlap and Similarity Metrics
6
+ #22 - Cohesion and Coherence Metrics
7
+ #23 - Genre and Register Features
8
+ """
9
+
10
+ from .cohesion_coherence import compute_cohesion_coherence
11
+ from .genre_register import compute_genre_register
12
+ from .markers import compute_stylistic_markers
13
+ from .vocabulary_overlap import compute_vocabulary_overlap
14
+
15
+ __all__ = [
16
+ "compute_stylistic_markers",
17
+ "compute_vocabulary_overlap",
18
+ "compute_cohesion_coherence",
19
+ "compute_genre_register",
20
+ ]
@@ -0,0 +1,45 @@
1
+ """Cohesion and coherence metrics.
2
+
3
+ This module measures how well a text holds together structurally (cohesion)
4
+ and semantically (coherence). Important for analyzing writing quality and
5
+ authorial sophistication.
6
+
7
+ Related GitHub Issue:
8
+ #22 - Cohesion and Coherence Metrics
9
+ https://github.com/craigtrim/pystylometry/issues/22
10
+
11
+ References:
12
+ Halliday, M. A. K., & Hasan, R. (1976). Cohesion in English. Longman.
13
+ Graesser, A. C., McNamara, D. S., & Kulikowich, J. M. (2011). Coh-Metrix.
14
+ """
15
+
16
+ from .._types import CohesionCoherenceResult
17
+
18
+
19
+ def compute_cohesion_coherence(text: str, model: str = "en_core_web_sm") -> CohesionCoherenceResult:
20
+ """
21
+ Compute cohesion and coherence metrics.
22
+
23
+ Related GitHub Issue:
24
+ #22 - Cohesion and Coherence Metrics
25
+ https://github.com/craigtrim/pystylometry/issues/22
26
+
27
+ Args:
28
+ text: Input text to analyze
29
+ model: spaCy model for linguistic analysis
30
+
31
+ Returns:
32
+ CohesionCoherenceResult with referential cohesion, lexical cohesion,
33
+ connective density, and coherence scores.
34
+
35
+ Example:
36
+ >>> result = compute_cohesion_coherence("Multi-paragraph text...")
37
+ >>> print(f"Pronoun density: {result.pronoun_density:.2f}")
38
+ >>> print(f"Connective density: {result.connective_density:.2f}")
39
+ """
40
+ # TODO: Implement cohesion/coherence analysis
41
+ # GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22
42
+ raise NotImplementedError(
43
+ "Cohesion/coherence metrics not yet implemented. "
44
+ "See GitHub Issue #22: https://github.com/craigtrim/pystylometry/issues/22"
45
+ )
@@ -0,0 +1,45 @@
1
+ """Genre and register classification features.
2
+
3
+ This module extracts features that distinguish between different text types
4
+ (academic, journalistic, fiction, legal, etc.) and formality levels.
5
+
6
+ Related GitHub Issue:
7
+ #23 - Genre and Register Features
8
+ https://github.com/craigtrim/pystylometry/issues/23
9
+
10
+ References:
11
+ Biber, D. (1988). Variation across speech and writing. Cambridge University Press.
12
+ Biber, D., & Conrad, S. (2009). Register, genre, and style.
13
+ """
14
+
15
+ from .._types import GenreRegisterResult
16
+
17
+
18
+ def compute_genre_register(text: str, model: str = "en_core_web_sm") -> GenreRegisterResult:
19
+ """
20
+ Analyze genre and register features for text classification.
21
+
22
+ Related GitHub Issue:
23
+ #23 - Genre and Register Features
24
+ https://github.com/craigtrim/pystylometry/issues/23
25
+
26
+ Args:
27
+ text: Input text to analyze
28
+ model: spaCy model for linguistic analysis
29
+
30
+ Returns:
31
+ GenreRegisterResult with formality scores, register classification,
32
+ genre predictions, and feature scores for major genres.
33
+
34
+ Example:
35
+ >>> result = compute_genre_register("Academic paper text...")
36
+ >>> print(f"Formality score: {result.formality_score:.2f}")
37
+ >>> print(f"Predicted genre: {result.predicted_genre}")
38
+ >>> print(f"Academic score: {result.academic_score:.3f}")
39
+ """
40
+ # TODO: Implement genre/register analysis
41
+ # GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23
42
+ raise NotImplementedError(
43
+ "Genre/register classification not yet implemented. "
44
+ "See GitHub Issue #23: https://github.com/craigtrim/pystylometry/issues/23"
45
+ )