pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
|
@@ -11,43 +11,805 @@ Related GitHub Issue:
|
|
|
11
11
|
Features analyzed:
|
|
12
12
|
- Syllable patterns and stress patterns
|
|
13
13
|
- Rhythmic regularity (coefficient of variation)
|
|
14
|
-
- Phonological features (alliteration, assonance)
|
|
14
|
+
- Phonological features (alliteration, assonance, consonance)
|
|
15
15
|
- Syllable complexity (consonant clusters)
|
|
16
|
-
- Sentence rhythm
|
|
16
|
+
- Sentence rhythm (length alternation)
|
|
17
17
|
- Polysyllabic word usage
|
|
18
|
+
- Metrical foot estimation (iambic, trochaic, dactylic, anapestic)
|
|
19
|
+
|
|
20
|
+
Dependencies:
|
|
21
|
+
- CMU Pronouncing Dictionary (via pronouncing package)
|
|
22
|
+
- pronouncing is already a dependency for pystylometry[readability]
|
|
18
23
|
|
|
19
24
|
References:
|
|
25
|
+
Fabb, N., & Halle, M. (2008). Meter in Poetry: A New Theory. Cambridge
|
|
26
|
+
University Press.
|
|
27
|
+
Greene, E., Bodrumlu, T., & Knight, K. (2010). Automatic analysis of rhythmic
|
|
28
|
+
poetry with applications to generation and translation. Proceedings of
|
|
29
|
+
EMNLP, 524-533.
|
|
20
30
|
Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
|
|
21
31
|
text comprehension. Memory & Cognition, 33(3), 388-396.
|
|
22
32
|
"""
|
|
23
33
|
|
|
34
|
+
from __future__ import annotations
|
|
35
|
+
|
|
36
|
+
import math
|
|
37
|
+
import re
|
|
38
|
+
from collections import Counter
|
|
39
|
+
from functools import lru_cache
|
|
40
|
+
from typing import Any
|
|
41
|
+
|
|
24
42
|
from .._types import RhythmProsodyResult
|
|
25
43
|
|
|
44
|
+
# =============================================================================
|
|
45
|
+
# DEPENDENCY: CMU PRONOUNCING DICTIONARY
|
|
46
|
+
# =============================================================================
|
|
47
|
+
# The pronouncing package provides access to the CMU Pronouncing Dictionary,
|
|
48
|
+
# which maps English words to ARPAbet phoneme sequences with stress markers.
|
|
49
|
+
# Stress markers: 0 = no stress, 1 = primary stress, 2 = secondary stress.
|
|
26
50
|
|
|
27
|
-
|
|
51
|
+
try:
|
|
52
|
+
import pronouncing # type: ignore[import-untyped]
|
|
53
|
+
except ImportError:
|
|
54
|
+
raise ImportError(
|
|
55
|
+
"The 'pronouncing' library is required for rhythm and prosody analysis. "
|
|
56
|
+
"Install it with: pip install pystylometry[readability]"
|
|
57
|
+
)
|
|
58
|
+
|
|
59
|
+
# =============================================================================
|
|
60
|
+
# VOWEL AND CONSONANT DEFINITIONS
|
|
61
|
+
# =============================================================================
|
|
62
|
+
# Used for alliteration, assonance, consonance, and cluster detection.
|
|
63
|
+
|
|
64
|
+
# Orthographic letter classes used by the letter-based fallbacks.
# Note that "y" is classified as a consonant here.
VOWELS = {"a", "e", "i", "o", "u"}
CONSONANTS = set("bcdfghjklmnpqrstvwxyz")

# ARPAbet vowel phonemes (used in CMU dictionary output), listed without
# their trailing stress digit.
ARPABET_VOWELS = {
    "AA", "AE", "AH", "AO", "AW",
    "AY", "EH", "ER", "EY", "IH",
    "IY", "OW", "OY", "UH", "UW",
}

# Consonant cluster patterns at word boundaries: a run of two or more
# consonant letters anchored at the start / end of the word.
# Reference: English phonotactics (Clements & Keyser, 1983)
INITIAL_CLUSTER_PATTERN = re.compile(r"^[bcdfghjklmnpqrstvwxyz]{2,}", re.IGNORECASE)
FINAL_CLUSTER_PATTERN = re.compile(r"[bcdfghjklmnpqrstvwxyz]{2,}$", re.IGNORECASE)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# =============================================================================
|
|
93
|
+
# PHONEME AND SYLLABLE HELPERS
|
|
94
|
+
# =============================================================================
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _extract_words(text: str) -> list[str]:
    """Extract word tokens from text, preserving order.

    A token is a run of ASCII letters that may contain internal apostrophes,
    so contractions and possessives ("don't", "it's", "o'clock") stay whole
    instead of being split into fragments such as "don" and "t". Apostrophes
    that are not surrounded by letters (e.g. quotation marks around a word)
    are not part of any token.

    Args:
        text: Raw input text.

    Returns:
        The tokens in their original order and original letter case.
        Empty list when the text contains no ASCII letters.
    """
    # Treat the typographic apostrophe (U+2019) like the ASCII one so both
    # spellings of a contraction produce the same token.
    normalized = text.replace("\u2019", "'")
    return re.findall(r"[a-zA-Z]+(?:'[a-zA-Z]+)*", normalized)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _split_sentences(text: str) -> list[str]:
    """Break text into sentences at runs of ``.``, ``!`` or ``?``.

    Each piece is whitespace-trimmed; pieces that are empty after trimming
    are discarded. The terminating punctuation is not kept.
    """
    trimmed = (piece.strip() for piece in re.split(r"[.!?]+", text))
    return [piece for piece in trimmed if piece]
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
@lru_cache(maxsize=4096)
def _get_phones(word: str) -> str | None:
    """Look up the first listed CMU pronunciation of a word.

    The word is lower-cased before the lookup. Results are memoized per
    distinct ``word`` argument.

    Returns:
        The ARPAbet phoneme string of the first pronunciation returned by
        ``pronouncing.phones_for_word``, or None when no pronunciation is
        found. Vowel phonemes carry a stress digit:
        0 = no stress, 1 = primary, 2 = secondary.
    """
    candidates = pronouncing.phones_for_word(word.lower())
    if not candidates:
        return None
    return candidates[0]  # type: ignore[no-any-return]
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
@lru_cache(maxsize=4096)
def _count_syllables(word: str) -> int:
    """Return the syllable count of a word.

    Uses the CMU pronunciation when one is available (syllables are counted
    by ``pronouncing.syllable_count`` on the phoneme string); otherwise
    defers to the letter-based heuristic in ``_fallback_syllable_count``.
    Results are memoized per distinct ``word`` argument.
    """
    phones = _get_phones(word)
    if not phones:
        return _fallback_syllable_count(word)
    return pronouncing.syllable_count(phones)  # type: ignore[no-any-return]
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _fallback_syllable_count(word: str) -> int:
    """Estimate syllables for a word that has no CMU pronunciation.

    Counts maximal runs of vowel letters (members of ``VOWELS``), removes one
    for a trailing "e" when more than one run was found (silent-e), and never
    returns less than 1.
    """
    lowered = word.lower()

    vowel_runs = 0
    inside_run = False
    for letter in lowered:
        if letter in VOWELS:
            # Only the first vowel of a run opens a new syllable nucleus.
            if not inside_run:
                vowel_runs += 1
            inside_run = True
        else:
            inside_run = False

    if vowel_runs > 1 and lowered.endswith("e"):
        vowel_runs -= 1

    return max(1, vowel_runs)
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def _get_stress_pattern(word: str) -> list[int]:
    """Return the per-syllable stress values of a word's CMU pronunciation.

    Every digit character in the phoneme string is taken as one stress
    marker, in order of appearance.

    Reference:
        CMU Pronouncing Dictionary stress encoding:
        0 = no stress, 1 = primary stress, 2 = secondary stress

    Returns:
        List of stress values, one per marked phoneme; empty list when the
        word has no CMU pronunciation.
    """
    phones = _get_phones(word)
    if phones:
        return [int(symbol) for symbol in phones if symbol.isdigit()]
    return []
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
def _get_vowel_phonemes(word: str) -> list[str]:
    """Return the vowel phonemes of a word with stress digits removed.

    Used for assonance detection (words sharing vowel sounds). Phonemes are
    kept in pronunciation order and may repeat. Returns an empty list when
    the word has no CMU pronunciation.
    """
    phones = _get_phones(word)
    if not phones:
        return []

    vowels: list[str] = []
    for phoneme in phones.split():
        bare = phoneme.rstrip("012")  # drop the trailing stress marker
        if bare in ARPABET_VOWELS:
            vowels.append(bare)
    return vowels
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def _get_initial_sound(word: str) -> str | None:
    """Return the word's initial consonant sound, or None for a vowel onset.

    Used for alliteration detection, where the returned symbols of adjacent
    words are compared for equality.

    When the word has a CMU pronunciation, the first phoneme (stress digit
    stripped) is returned unless it is an ARPAbet vowel. Otherwise the first
    letter is used as an approximation if it is a consonant letter.

    The fallback letter is upper-cased so that it lives in the same symbol
    space as the upper-case ARPAbet phonemes; a lower-case letter could never
    compare equal to a dictionary phoneme, which made alliteration between a
    dictionary word and an out-of-dictionary word undetectable.

    Returns:
        An upper-case consonant symbol, or None when the word starts with a
        vowel sound/letter or is empty.
    """
    phones = _get_phones(word)
    if phones:
        first_phoneme = phones.split()[0].rstrip("012")
        # A vowel onset carries no alliterating consonant.
        return None if first_phoneme in ARPABET_VOWELS else first_phoneme

    # Fallback: approximate the onset by the first letter if it is a consonant.
    first_letter = word[:1].lower()
    if first_letter in CONSONANTS:
        return first_letter.upper()
    return None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _get_consonant_phonemes(word: str) -> list[str]:
    """Return the non-vowel phonemes of a word's CMU pronunciation.

    Used for consonance detection (words sharing consonant sounds). A phoneme
    is kept when, after stripping a trailing stress digit, it is not in
    ``ARPABET_VOWELS``; the phoneme itself is returned unmodified. Returns an
    empty list when the word has no CMU pronunciation.
    """
    phones = _get_phones(word)
    if not phones:
        return []

    consonants: list[str] = []
    for phoneme in phones.split():
        if phoneme.rstrip("012") in ARPABET_VOWELS:
            continue
        consonants.append(phoneme)
    return consonants
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
# =============================================================================
|
|
212
|
+
# SYLLABLE PATTERN METRICS
|
|
213
|
+
# =============================================================================
|
|
214
|
+
|
|
215
|
+
|
|
216
|
+
def _compute_syllable_metrics(
    words: list[str],
) -> tuple[float, float, float, float, list[int]]:
    """Summarize the per-word syllable distribution.

    Args:
        words: Word tokens to analyze.

    Returns:
        ``(mean_syllables, std_dev, polysyllabic_ratio, monosyllabic_ratio,
        syllable_counts)`` where

        - ``std_dev`` is the population standard deviation (divides by n),
        - ``polysyllabic_ratio`` is the fraction of words with 3+ syllables
          (relevant for readability and stylistic complexity),
        - ``monosyllabic_ratio`` is the fraction of single-syllable words
          (a high value suggests a simpler, more direct style),
        - ``syllable_counts`` holds one count per input word, in order.

        All statistics are 0.0 and the list is empty when ``words`` is empty.
    """
    if not words:
        return 0.0, 0.0, 0.0, 0.0, []

    syllable_counts = [_count_syllables(token) for token in words]
    total = len(syllable_counts)
    mean_syl = sum(syllable_counts) / total

    squared_deviation = sum((c - mean_syl) ** 2 for c in syllable_counts)
    std_dev = math.sqrt(squared_deviation / total)

    long_words = len([c for c in syllable_counts if c >= 3])
    short_words = syllable_counts.count(1)

    return mean_syl, std_dev, long_words / total, short_words / total, syllable_counts
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
# =============================================================================
|
|
253
|
+
# RHYTHMIC REGULARITY
|
|
254
|
+
# =============================================================================
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def _compute_rhythmic_regularity(syllable_counts: list[int]) -> tuple[float, float]:
    """Derive rhythmic regularity from per-word syllable counts.

    Regularity is the reciprocal of the coefficient of variation (CV) of the
    syllable counts: the more uniform the word lengths, the lower the CV and
    the higher the regularity.

    Formula:
        CV = σ / μ          (σ is the population standard deviation)
        Regularity = 1 / CV

    Special cases:
        - Empty input or a mean of 0 yields ``(0.0, 0.0)``.
        - A CV of exactly 0 (all counts equal) yields a regularity equal to
          the number of words, used as a practical upper bound.

    Reference:
        Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm
        and text comprehension. Memory & Cognition, 33(3), 388-396.

    Returns:
        (rhythmic_regularity, syllable_cv)
    """
    word_count = len(syllable_counts)
    if word_count == 0:
        return 0.0, 0.0

    mean_syl = sum(syllable_counts) / word_count
    if mean_syl == 0.0:
        return 0.0, 0.0

    variance = sum((c - mean_syl) ** 2 for c in syllable_counts) / word_count
    cv = math.sqrt(variance) / mean_syl

    # Perfectly uniform counts would divide by zero; cap at the word count.
    regularity = 1.0 / cv if cv != 0.0 else float(word_count)
    return regularity, cv
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
# =============================================================================
|
|
300
|
+
# STRESS PATTERN ENTROPY
|
|
301
|
+
# =============================================================================
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
def _compute_stress_entropy(words: list[str]) -> float:
    """Compute the Shannon entropy of word-level stress patterns.

    Each word with a known stress pattern contributes one categorical event:
    its stress sequence binarized to a string such as "10" (trochee) or "01"
    (iamb). Words without stress data are ignored. Higher entropy means more
    varied stress patterns; lower entropy means a few patterns dominate.

    Formula:
        H = -Σ p(pattern) × log₂(p(pattern))

    Reference:
        Shannon, C. E. (1948). A Mathematical Theory of Communication.
        Applied here to prosodic analysis following Greene et al. (2010).

    Returns:
        Shannon entropy in bits. 0.0 if no stress data is available.
    """
    tally: Counter[str] = Counter()
    for word in words:
        stress = _get_stress_pattern(word)
        if not stress:
            continue
        # Binarize: 0 stays unstressed, primary (1) and secondary (2) both
        # count as stressed.
        tally["".join("1" if level > 0 else "0" for level in stress)] += 1

    observed = sum(tally.values())
    if observed == 0:
        return 0.0

    entropy = 0.0
    for occurrences in tally.values():
        probability = occurrences / observed
        entropy -= probability * math.log2(probability)
    return entropy
|
|
342
|
+
|
|
343
|
+
|
|
344
|
+
# =============================================================================
|
|
345
|
+
# SENTENCE RHYTHM
|
|
346
|
+
# =============================================================================
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
def _compute_sentence_rhythm(text: str) -> tuple[float, float]:
    """Compute sentence-level rhythm metrics.

    Sentence length alternation measures how strongly long and short
    sentences alternate. Authors with strong prose rhythm tend to vary
    sentence length deliberately, creating a sense of pacing.

    Alternation score: mean absolute difference in word count between
    consecutive sentences, normalized by the mean sentence length.

    Sentence rhythm score: the average of the alternation score and the
    coefficient of variation of sentence lengths (population standard
    deviation divided by the mean).

    Sentences that contain no words are ignored. Fewer than two non-empty
    sentences yields ``(0.0, 0.0)``.

    Reference:
        Cutts, M. (2013). Oxford Guide to Plain English (4th ed.).
        Recommends varying sentence length for readability.

    Returns:
        (sentence_length_alternation, sentence_rhythm_score)
    """
    word_counts = (len(_extract_words(sentence)) for sentence in _split_sentences(text))
    lengths = [count for count in word_counts if count > 0]
    if len(lengths) < 2:
        return 0.0, 0.0

    # Every retained length is >= 1, so the mean is strictly positive and the
    # divisions below cannot hit zero.
    mean_len = sum(lengths) / len(lengths)

    # Alternation: mean absolute diff between consecutive sentences.
    diffs = [abs(current - previous) for previous, current in zip(lengths, lengths[1:])]
    alternation = (sum(diffs) / len(diffs)) / mean_len

    # Rhythm score: combines alternation with the coefficient of variation.
    variance = sum((length - mean_len) ** 2 for length in lengths) / len(lengths)
    cv = math.sqrt(variance) / mean_len
    rhythm_score = (alternation + cv) / 2.0

    return alternation, rhythm_score
|
|
393
|
+
|
|
394
|
+
|
|
395
|
+
# =============================================================================
|
|
396
|
+
# PHONOLOGICAL FEATURES: ALLITERATION, ASSONANCE, CONSONANCE
|
|
397
|
+
# =============================================================================
|
|
398
|
+
|
|
399
|
+
|
|
400
|
+
def _compute_alliteration(words: list[str]) -> float:
    """Compute alliteration density (alliterative pairs per 100 words).

    Alliteration is the repetition of initial consonant sounds in adjacent
    or nearby words. Only directly consecutive word pairs are examined here;
    a pair counts when both words have an initial consonant sound (as given
    by ``_get_initial_sound``) and the two sounds are equal.

    Formula:
        density = (alliterative_pairs / total_words) × 100

    Reference:
        Fabb, N., & Halle, M. (2008). Meter in Poetry. Cambridge University
        Press. Chapter on phonological repetition in verse.

    Returns:
        Alliterative word pairs per 100 words; 0.0 for fewer than two words.
    """
    if len(words) < 2:
        return 0.0

    onsets = [_get_initial_sound(token) for token in words]
    matches = sum(
        1
        for left, right in zip(onsets, onsets[1:])
        if left and right and left == right
    )
    return (matches / len(words)) * 100.0
|
|
428
|
+
|
|
429
|
+
|
|
430
|
+
def _compute_assonance(words: list[str]) -> float:
    """Compute assonance density (assonant pairs per 100 words).

    Assonance is the repetition of vowel sounds within nearby words,
    regardless of surrounding consonants. Only directly consecutive word
    pairs are examined; a pair counts when the two words share at least one
    vowel phoneme.

    Formula:
        density = (assonant_pairs / total_words) × 100

    Returns:
        Assonant word pairs per 100 words; 0.0 for fewer than two words.
    """
    if len(words) < 2:
        return 0.0

    vowel_sets = [set(_get_vowel_phonemes(token)) for token in words]
    # A non-empty intersection implies both sets are non-empty.
    matches = sum(
        1
        for left, right in zip(vowel_sets, vowel_sets[1:])
        if not left.isdisjoint(right)
    )
    return (matches / len(words)) * 100.0
|
|
454
|
+
|
|
455
|
+
|
|
456
|
+
def _compute_consonance(words: list[str]) -> float:
    """Compute consonance density (consonant-repeating pairs per 100 words).

    Consonance is the repetition of consonant sounds within nearby words,
    especially at the end of words. Only directly consecutive word pairs are
    examined; a pair counts when the two words share at least one consonant
    phoneme anywhere in their pronunciations.

    Formula:
        density = (consonant_pairs / total_words) × 100

    Returns:
        Consonant-repeating word pairs per 100 words; 0.0 for fewer than two
        words.
    """
    if len(words) < 2:
        return 0.0

    consonant_sets = [set(_get_consonant_phonemes(token)) for token in words]
    # A non-empty intersection implies both sets are non-empty.
    matches = sum(
        1
        for left, right in zip(consonant_sets, consonant_sets[1:])
        if not left.isdisjoint(right)
    )
    return (matches / len(words)) * 100.0
|
|
480
|
+
|
|
481
|
+
|
|
482
|
+
# =============================================================================
|
|
483
|
+
# CONSONANT CLUSTER METRICS
|
|
484
|
+
# =============================================================================
|
|
485
|
+
|
|
486
|
+
|
|
487
|
+
def _compute_cluster_metrics(
    words: list[str],
) -> tuple[float, float, float]:
    """Compute consonant cluster complexity metrics.

    Consonant clusters (two or more consonant letters in sequence) contribute
    to the perceived complexity and rhythm of text. Only clusters at the
    start and at the end of each word are examined, using the module's
    letter-based cluster patterns.

    A word made up entirely of consonant letters (e.g. "by") is a single
    cluster that is both word-initial and word-final. It counts toward both
    ratios, but its length is recorded only once so that it does not weigh
    double in the mean.

    Returns:
        (mean_cluster_length, initial_cluster_ratio, final_cluster_ratio)

    Where:
        mean_cluster_length: average length of all distinct boundary
            clusters found (0.0 when none were found)
        initial_cluster_ratio: fraction of words starting with a cluster
        final_cluster_ratio: fraction of words ending with a cluster

    All three values are 0.0 when ``words`` is empty.
    """
    if not words:
        return 0.0, 0.0, 0.0

    cluster_lengths: list[int] = []
    initial_count = 0
    final_count = 0

    for word in words:
        lowered = word.lower()
        initial_match = INITIAL_CLUSTER_PATTERN.match(lowered)
        final_match = FINAL_CLUSTER_PATTERN.search(lowered)

        if initial_match:
            initial_count += 1
            cluster_lengths.append(len(initial_match.group()))

        if final_match:
            final_count += 1
            # Same span as the initial match means the whole word is one
            # cluster; its length has already been recorded above.
            same_cluster = (
                initial_match is not None
                and initial_match.span() == final_match.span()
            )
            if not same_cluster:
                cluster_lengths.append(len(final_match.group()))

    n = len(words)
    mean_cluster = sum(cluster_lengths) / len(cluster_lengths) if cluster_lengths else 0.0

    return mean_cluster, initial_count / n, final_count / n
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
# =============================================================================
|
|
533
|
+
# METRICAL FOOT ESTIMATION
|
|
534
|
+
# =============================================================================
|
|
535
|
+
|
|
536
|
+
|
|
537
|
+
def _compute_metrical_feet(words: list[str]) -> tuple[float, float, float, float]:
    """Estimate metrical foot ratios from word-level stress patterns.

    Classical meter is defined by patterns of stressed (S) and unstressed (U)
    syllables:
        - Iamb: U-S (e.g., "above", "begin")
        - Trochee: S-U (e.g., "garden", "happy")
        - Dactyl: S-U-U (e.g., "merrily", "beautiful")
        - Anapest: U-U-S (e.g., "understand", "intervene")

    Each word with at least two stress values is binarized (0 = unstressed,
    1 or 2 = stressed) and scanned with overlapping windows: every adjacent
    pair is tested for iamb/trochee and every adjacent triple for
    dactyl/anapest. Feet are detected within words only, never across word
    boundaries, and one syllable may take part in several detected feet.

    Reference:
        Fabb, N., & Halle, M. (2008). Meter in Poetry. Cambridge University
        Press.

    Returns:
        (iambic_ratio, trochaic_ratio, dactylic_ratio, anapestic_ratio)
        Each as a fraction of total detected foot patterns; all 0.0 when no
        foot was detected.
    """
    pair_feet = {(0, 1): "iamb", (1, 0): "trochee"}
    triple_feet = {(1, 0, 0): "dactyl", (0, 0, 1): "anapest"}
    tallies = {"iamb": 0, "trochee": 0, "dactyl": 0, "anapest": 0}

    for word in words:
        stress = _get_stress_pattern(word)
        if len(stress) < 2:
            continue

        beats = [1 if level > 0 else 0 for level in stress]

        for pair in zip(beats, beats[1:]):
            foot = pair_feet.get(pair)
            if foot is not None:
                tallies[foot] += 1

        for triple in zip(beats, beats[1:], beats[2:]):
            foot = triple_feet.get(triple)
            if foot is not None:
                tallies[foot] += 1

    total = sum(tallies.values())
    if total == 0:
        return 0.0, 0.0, 0.0, 0.0

    return (
        tallies["iamb"] / total,
        tallies["trochee"] / total,
        tallies["dactyl"] / total,
        tallies["anapest"] / total,
    )
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
# =============================================================================
|
|
605
|
+
# MAIN ENTRY POINT
|
|
606
|
+
# =============================================================================
|
|
607
|
+
|
|
608
|
+
|
|
609
|
+
def compute_rhythm_prosody(text: str) -> RhythmProsodyResult:
    """Compute rhythm and prosody metrics for written text.

    Analyzes the musical qualities of written language through syllable
    patterns, stress patterns, phonological features, and metrical foot
    estimation. These metrics are particularly relevant for poetry analysis
    and literary stylometry, but also capture prose rhythm patterns.

    Related GitHub Issue:
        #25 - Rhythm and Prosody Metrics
        https://github.com/craigtrim/pystylometry/issues/25

    Metrics computed:

        Syllable patterns:
            - mean_syllables_per_word: Average syllable count across all words.
            - syllable_std_dev: Standard deviation of syllable counts.
            - polysyllabic_ratio: Fraction of words with 3+ syllables.
            - monosyllabic_ratio: Fraction of single-syllable words.

        Rhythmic regularity:
            - rhythmic_regularity: 1 / CV of syllable counts (higher = more regular).
            - syllable_cv: Coefficient of variation of syllable counts.
            - stress_pattern_entropy: Shannon entropy of stress patterns in bits.

        Sentence rhythm:
            - sentence_length_alternation: Normalized mean absolute difference
              between consecutive sentence lengths.
            - sentence_rhythm_score: Composite metric combining alternation
              and sentence length variance.

        Phonological features:
            - alliteration_density: Alliterative word pairs per 100 words.
            - assonance_density: Assonant word pairs per 100 words.
            - consonance_density: Consonant-repeating pairs per 100 words.

        Syllable complexity:
            - mean_consonant_cluster_length: Average length of consonant clusters.
            - initial_cluster_ratio: Words starting with consonant clusters.
            - final_cluster_ratio: Words ending with consonant clusters.

        Stress patterns (metrical feet):
            - iambic_ratio: Unstressed-stressed pairs / total feet.
            - trochaic_ratio: Stressed-unstressed pairs / total feet.
            - dactylic_ratio: Stressed-unstressed-unstressed trigrams / total feet.
            - anapestic_ratio: Unstressed-unstressed-stressed trigrams / total feet.

    Metadata keys (non-empty input):
        word_count, unique_words, sentence_count, total_syllables,
        cmu_coverage, syllable_distribution, word_stress_patterns.
        ``unique_words`` and ``cmu_coverage`` are both computed over
        case-insensitive (lowercased) word types, matching the lowercased
        keys of ``word_stress_patterns``.

    Dependencies:
        Requires the ``pronouncing`` package for CMU dictionary access.
        Install with: ``pip install pystylometry[readability]``

    References:
        Fabb, N., & Halle, M. (2008). Meter in Poetry: A New Theory.
            Cambridge University Press.
        Greene, E., Bodrumlu, T., & Knight, K. (2010). Automatic analysis of
            rhythmic poetry with applications to generation and translation.
            Proceedings of EMNLP, 524-533.
        Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm
            and text comprehension. Memory & Cognition, 33(3), 388-396.

    Args:
        text: Input text to analyze. For reliable prosodic statistics, at
            least 100+ words are recommended. Shorter texts will produce
            valid but potentially unstable metrics.

    Returns:
        RhythmProsodyResult with syllable patterns, rhythmic regularity,
        phonological features, stress patterns, and complexity metrics.
        See ``_types.RhythmProsodyResult`` for complete field documentation.
        Empty or word-less input yields an all-zero result whose metadata
        carries ``word_count`` 0 and a ``warning`` string.

    Example:
        >>> result = compute_rhythm_prosody("The quick brown fox jumps over the lazy dog.")
        >>> print(f"Syllables/word: {result.mean_syllables_per_word:.2f}")
        >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
        >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
    """
    # Handle empty text
    if not text or not text.strip():
        return _empty_rhythm_prosody_result("Empty text")

    words = _extract_words(text)
    if not words:
        return _empty_rhythm_prosody_result("No words found")

    # =========================================================================
    # SYLLABLE PATTERNS
    # =========================================================================
    mean_syl, syl_std, poly_ratio, mono_ratio, syl_counts = _compute_syllable_metrics(words)

    # =========================================================================
    # RHYTHMIC REGULARITY
    # =========================================================================
    regularity, cv = _compute_rhythmic_regularity(syl_counts)

    # =========================================================================
    # STRESS PATTERN ENTROPY
    # =========================================================================
    stress_entropy = _compute_stress_entropy(words)

    # =========================================================================
    # SENTENCE RHYTHM
    # =========================================================================
    alternation, rhythm_score = _compute_sentence_rhythm(text)

    # =========================================================================
    # PHONOLOGICAL FEATURES
    # =========================================================================
    alliteration = _compute_alliteration(words)
    assonance = _compute_assonance(words)
    consonance = _compute_consonance(words)

    # =========================================================================
    # CONSONANT CLUSTER COMPLEXITY
    # =========================================================================
    mean_cluster, initial_ratio, final_ratio = _compute_cluster_metrics(words)

    # =========================================================================
    # METRICAL FOOT ESTIMATION
    # =========================================================================
    iambic, trochaic, dactylic, anapestic = _compute_metrical_feet(words)

    # =========================================================================
    # METADATA
    # =========================================================================
    surface_forms = set(words)
    # Case-insensitive word types. The stress-pattern dict below is keyed by
    # lowercased word, so coverage must be measured against this set; dividing
    # by the case-sensitive count would under-report coverage whenever a word
    # appears in more than one casing (e.g. "The" and "the").
    unique_lower = {w.lower() for w in surface_forms}

    # Collect per-word stress patterns for downstream analysis
    word_stress_patterns: dict[str, list[int]] = {}
    for word in surface_forms:
        stress = _get_stress_pattern(word)
        if stress:
            word_stress_patterns[word.lower()] = stress

    # ``words`` is non-empty here, so ``unique_lower`` is too (no zero division).
    cmu_coverage = len(word_stress_patterns) / len(unique_lower)

    metadata: dict[str, Any] = {
        "word_count": len(words),
        "unique_words": len(unique_lower),
        "sentence_count": len(_split_sentences(text)),
        "total_syllables": sum(syl_counts),
        "cmu_coverage": cmu_coverage,
        "syllable_distribution": dict(Counter(syl_counts)),
        "word_stress_patterns": word_stress_patterns,
    }

    return RhythmProsodyResult(
        mean_syllables_per_word=mean_syl,
        syllable_std_dev=syl_std,
        polysyllabic_ratio=poly_ratio,
        monosyllabic_ratio=mono_ratio,
        rhythmic_regularity=regularity,
        syllable_cv=cv,
        stress_pattern_entropy=stress_entropy,
        sentence_length_alternation=alternation,
        sentence_rhythm_score=rhythm_score,
        alliteration_density=alliteration,
        assonance_density=assonance,
        consonance_density=consonance,
        mean_consonant_cluster_length=mean_cluster,
        initial_cluster_ratio=initial_ratio,
        final_cluster_ratio=final_ratio,
        iambic_ratio=iambic,
        trochaic_ratio=trochaic,
        dactylic_ratio=dactylic,
        anapestic_ratio=anapestic,
        metadata=metadata,
    )


def _empty_rhythm_prosody_result(warning: str) -> RhythmProsodyResult:
    """Build the all-zero result returned when there is nothing to analyze.

    Args:
        warning: Human-readable reason stored under ``metadata["warning"]``.

    Returns:
        RhythmProsodyResult with every metric set to 0.0 and metadata
        ``{"word_count": 0, "warning": warning}``.
    """
    return RhythmProsodyResult(
        mean_syllables_per_word=0.0,
        syllable_std_dev=0.0,
        polysyllabic_ratio=0.0,
        monosyllabic_ratio=0.0,
        rhythmic_regularity=0.0,
        syllable_cv=0.0,
        stress_pattern_entropy=0.0,
        sentence_length_alternation=0.0,
        sentence_rhythm_score=0.0,
        alliteration_density=0.0,
        assonance_density=0.0,
        consonance_density=0.0,
        mean_consonant_cluster_length=0.0,
        initial_cluster_ratio=0.0,
        final_cluster_ratio=0.0,
        iambic_ratio=0.0,
        trochaic_ratio=0.0,
        dactylic_ratio=0.0,
        anapestic_ratio=0.0,
        metadata={"word_count": 0, "warning": warning},
    )
|