pystylometry 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -11,43 +11,805 @@ Related GitHub Issue:
  Features analyzed:
  - Syllable patterns and stress patterns
  - Rhythmic regularity (coefficient of variation)
- - Phonological features (alliteration, assonance)
+ - Phonological features (alliteration, assonance, consonance)
  - Syllable complexity (consonant clusters)
- - Sentence rhythm
+ - Sentence rhythm (length alternation)
  - Polysyllabic word usage
+ - Metrical foot estimation (iambic, trochaic, dactylic, anapestic)
+
+ Dependencies:
+ - CMU Pronouncing Dictionary (via pronouncing package)
+ - pronouncing is already a dependency for pystylometry[readability]

  References:
+ Fabb, N., & Halle, M. (2008). Meter in Poetry: A New Theory. Cambridge
+ University Press.
+ Greene, E., Bodrumlu, T., & Knight, K. (2010). Automatic analysis of rhythmic
+ poetry with applications to generation and translation. Proceedings of
+ EMNLP, 524-533.
  Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
  text comprehension. Memory & Cognition, 33(3), 388-396.
  """
 
+ from __future__ import annotations
+
+ import math
+ import re
+ from collections import Counter
+ from functools import lru_cache
+ from typing import Any
+
  from .._types import RhythmProsodyResult

+ # =============================================================================
+ # DEPENDENCY: CMU PRONOUNCING DICTIONARY
+ # =============================================================================
+ # The pronouncing package provides access to the CMU Pronouncing Dictionary,
+ # which maps English words to ARPAbet phoneme sequences with stress markers.
+ # Stress markers: 0 = no stress, 1 = primary stress, 2 = secondary stress.

- def compute_rhythm_prosody(text: str) -> RhythmProsodyResult:
+ try:
+     import pronouncing  # type: ignore[import-untyped]
+ except ImportError:
+     raise ImportError(
+         "The 'pronouncing' library is required for rhythm and prosody analysis. "
+         "Install it with: pip install pystylometry[readability]"
+     )
+
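For orientation, a small sketch of what this dependency returns; the exact strings come from the installed CMU dictionary, so treat the values shown as illustrative rather than guaranteed:

    import pronouncing

    phones = pronouncing.phones_for_word("garden")         # e.g. ['G AA1 R D AH0 N']
    if phones:
        print(pronouncing.syllable_count(phones[0]))        # 2 vowel phonemes -> 2 syllables
        print([int(c) for c in phones[0] if c.isdigit()])   # stress digits, e.g. [1, 0]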
+ # =============================================================================
+ # VOWEL AND CONSONANT DEFINITIONS
+ # =============================================================================
+ # Used for alliteration, assonance, consonance, and cluster detection.
+
+ VOWELS = set("aeiou")
+ CONSONANTS = set("bcdfghjklmnpqrstvwxyz")
+
+ # ARPAbet vowel phonemes (used in CMU dictionary output)
+ ARPABET_VOWELS = {
+     "AA",
+     "AE",
+     "AH",
+     "AO",
+     "AW",
+     "AY",
+     "EH",
+     "ER",
+     "EY",
+     "IH",
+     "IY",
+     "OW",
+     "OY",
+     "UH",
+     "UW",
+ }
+
+ # Consonant cluster patterns at word boundaries
+ # Reference: English phonotactics (Clements & Keyser, 1983)
+ INITIAL_CLUSTER_PATTERN = re.compile(r"^[bcdfghjklmnpqrstvwxyz]{2,}", re.IGNORECASE)
+ FINAL_CLUSTER_PATTERN = re.compile(r"[bcdfghjklmnpqrstvwxyz]{2,}$", re.IGNORECASE)
+
+
+ # =============================================================================
+ # PHONEME AND SYLLABLE HELPERS
+ # =============================================================================
+
+
+ def _extract_words(text: str) -> list[str]:
+     """Extract alphabetic words from text, preserving order."""
+     return re.findall(r"[a-zA-Z]+", text)
+
+
+ def _split_sentences(text: str) -> list[str]:
+     """Split text into sentences on sentence-ending punctuation."""
+     sentences = re.split(r"[.!?]+", text)
+     return [s.strip() for s in sentences if s.strip()]
+
+
+ @lru_cache(maxsize=4096)
+ def _get_phones(word: str) -> str | None:
+     """Get the first (most common) pronunciation from CMU dictionary.
+
+     Returns the ARPAbet phoneme string, or None if the word is not found.
+     CMU stress markers: 0 = no stress, 1 = primary, 2 = secondary.
+     """
+     phones_list = pronouncing.phones_for_word(word.lower())
+     if phones_list:
+         return phones_list[0]  # type: ignore[no-any-return]
+     return None
+
+
+ @lru_cache(maxsize=4096)
+ def _count_syllables(word: str) -> int:
+     """Count syllables using CMU dictionary, falling back to vowel heuristic.
+
+     The CMU dictionary provides phoneme-level transcriptions with stress markers.
+     Each vowel phoneme (marked 0, 1, or 2) represents one syllable nucleus.
+     """
+     phones = _get_phones(word)
+     if phones:
+         return pronouncing.syllable_count(phones)  # type: ignore[no-any-return]
+     return _fallback_syllable_count(word)
+
+
+ def _fallback_syllable_count(word: str) -> int:
+     """Heuristic syllable count for words not in CMU dictionary.
+
+     Counts vowel groups and adjusts for silent-e.
+     """
+     word = word.lower()
+     count = 0
+     prev_vowel = False
+     for ch in word:
+         is_vowel = ch in VOWELS
+         if is_vowel and not prev_vowel:
+             count += 1
+         prev_vowel = is_vowel
+     if word.endswith("e") and count > 1:
+         count -= 1
+     return max(1, count)
+
+
+ def _get_stress_pattern(word: str) -> list[int]:
+     """Extract stress pattern from CMU pronunciation.
+
+     Returns a list of stress values (0, 1, 2) for each syllable.
+     Returns empty list if word is not in CMU dictionary.
+
+     Reference:
+         CMU Pronouncing Dictionary stress encoding:
+         0 = no stress, 1 = primary stress, 2 = secondary stress
+     """
+     phones = _get_phones(word)
+     if not phones:
+         return []
+     return [int(ch) for ch in phones if ch.isdigit()]
+
+
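As a quick illustration of the stress helper (assuming the usual CMU entries for these words; real output depends on the installed dictionary):

    _get_stress_pattern("garden")   # e.g. [1, 0]  (stressed-unstressed)
    _get_stress_pattern("begin")    # e.g. [0, 1]  (unstressed-stressed)
    _get_stress_pattern("xyzzy")    # []  (not in the CMU dictionary)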
+ def _get_vowel_phonemes(word: str) -> list[str]:
+     """Extract vowel phonemes (without stress markers) from CMU pronunciation.
+
+     Used for assonance detection: words sharing vowel sounds.
+     """
+     phones = _get_phones(word)
+     if not phones:
+         return []
+     phonemes = phones.split()
+     return [p.rstrip("012") for p in phonemes if p.rstrip("012") in ARPABET_VOWELS]
+
+
+ def _get_initial_sound(word: str) -> str | None:
+     """Get the initial consonant sound from CMU pronunciation.
+
+     Used for alliteration detection: words sharing initial consonant sounds.
+     Falls back to the first letter if the word is not in CMU dictionary.
+     """
+     phones = _get_phones(word)
+     if phones:
+         first_phoneme = phones.split()[0].rstrip("012")
+         if first_phoneme not in ARPABET_VOWELS:
+             return first_phoneme
+         return None  # Word starts with a vowel sound
+     # Fallback: use first letter if consonant
+     w = word.lower()
+     if w and w[0] in CONSONANTS:
+         return w[0]
+     return None
+
+
+ def _get_consonant_phonemes(word: str) -> list[str]:
+     """Extract consonant phonemes from CMU pronunciation.
+
+     Used for consonance detection: words sharing consonant sounds.
+     """
+     phones = _get_phones(word)
+     if not phones:
+         return []
+     phonemes = phones.split()
+     return [p for p in phonemes if p.rstrip("012") not in ARPABET_VOWELS]
+
+
+ # =============================================================================
+ # SYLLABLE PATTERN METRICS
+ # =============================================================================
+
+
+ def _compute_syllable_metrics(
+     words: list[str],
+ ) -> tuple[float, float, float, float, list[int]]:
+     """Compute syllable distribution metrics.
+
+     Returns:
+         (mean_syllables, std_dev, polysyllabic_ratio, monosyllabic_ratio,
+         syllable_counts)
+
+     Polysyllabic ratio: fraction of words with 3+ syllables.
+     Relevant for readability and stylistic complexity.
+     Monosyllabic ratio: fraction of single-syllable words.
+     High monosyllabic ratio suggests simpler, more direct style.
+     """
+     if not words:
+         return 0.0, 0.0, 0.0, 0.0, []
+
+     syllable_counts = [_count_syllables(w) for w in words]
+     n = len(syllable_counts)
+
+     mean_syl = sum(syllable_counts) / n
+     variance = sum((s - mean_syl) ** 2 for s in syllable_counts) / n
+     std_dev = math.sqrt(variance)
+
+     polysyllabic = sum(1 for s in syllable_counts if s >= 3)
+     monosyllabic = sum(1 for s in syllable_counts if s == 1)
+
+     return (
+         mean_syl,
+         std_dev,
+         polysyllabic / n,
+         monosyllabic / n,
+         syllable_counts,
+     )
+
+
+ # =============================================================================
+ # RHYTHMIC REGULARITY
+ # =============================================================================
+
+
+ def _compute_rhythmic_regularity(syllable_counts: list[int]) -> tuple[float, float]:
+     """Compute rhythmic regularity from syllable count distribution.
+
+     Rhythmic regularity is the inverse of the coefficient of variation (CV)
+     of syllable counts per word. Lower CV means more uniform syllable lengths,
+     which produces a more metrically regular text.
+
+     Formula:
+         CV = σ / μ (coefficient of variation)
+         Regularity = 1 / CV (higher = more regular rhythm)
+
+     When CV is 0 (all words same length), regularity is set to the word count
+     as a practical upper bound.
+
+     Reference:
+         Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm
+         and text comprehension. Memory & Cognition, 33(3), 388-396.
+
+     Returns:
+         (rhythmic_regularity, syllable_cv)
+     """
+     if not syllable_counts:
+         return 0.0, 0.0
+
+     n = len(syllable_counts)
+     mean_syl = sum(syllable_counts) / n
+     if mean_syl == 0.0:
+         return 0.0, 0.0
+
+     variance = sum((s - mean_syl) ** 2 for s in syllable_counts) / n
+     std_dev = math.sqrt(variance)
+     cv = std_dev / mean_syl
+
+     if cv == 0.0:
+         # All words have the same syllable count: maximally regular
+         regularity = float(n)
+     else:
+         regularity = 1.0 / cv
+
+     return regularity, cv
+
+
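A small worked example of the regularity formula, using made-up syllable counts:

    # counts [1, 2, 1, 2]: mean = 1.5, std dev = 0.5, CV = 0.5 / 1.5 ≈ 0.333
    _compute_rhythmic_regularity([1, 2, 1, 2])   # ≈ (3.0, 0.333): regularity = 1 / CV
    _compute_rhythmic_regularity([2, 2, 2])      # (3.0, 0.0): CV = 0, so regularity = n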
+ # =============================================================================
+ # STRESS PATTERN ENTROPY
+ # =============================================================================
+
+
+ def _compute_stress_entropy(words: list[str]) -> float:
+     """Compute Shannon entropy of stress patterns across words.
+
+     Each word's stress pattern (e.g., "10" for trochee, "01" for iamb) is
+     treated as a categorical event. Higher entropy means more varied stress
+     patterns; lower entropy means the text gravitates toward a few dominant
+     metrical feet.
+
+     Formula:
+         H = -Σ p(pattern) × log₂(p(pattern))
+
+     Reference:
+         Shannon, C. E. (1948). A Mathematical Theory of Communication.
+         Applied here to prosodic analysis following Greene et al. (2010).
+
+     Returns:
+         Shannon entropy in bits. 0.0 if no stress data available.
+     """
+     patterns: list[str] = []
+     for word in words:
+         stress = _get_stress_pattern(word)
+         if stress:
+             # Binarize: 0 stays 0 (unstressed), 1 or 2 become 1 (stressed)
+             binary = "".join("1" if s > 0 else "0" for s in stress)
+             patterns.append(binary)
+
+     if not patterns:
+         return 0.0
+
+     counts = Counter(patterns)
+     total = len(patterns)
+     entropy = 0.0
+     for count in counts.values():
+         p = count / total
+         if p > 0:
+             entropy -= p * math.log2(p)
+
+     return entropy
+
+
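A worked example of the entropy formula, independent of any dictionary lookup:

    # Suppose four words binarize to the patterns "10", "10", "01", "1":
    #   p("10") = 0.5, p("01") = 0.25, p("1") = 0.25
    #   H = -(0.5*log2(0.5) + 0.25*log2(0.25) + 0.25*log2(0.25)) = 1.5 bits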
+ # =============================================================================
+ # SENTENCE RHYTHM
+ # =============================================================================
+
+
+ def _compute_sentence_rhythm(text: str) -> tuple[float, float]:
+     """Compute sentence-level rhythm metrics.
+
+     Sentence length alternation measures the degree to which long and short
+     sentences alternate. Authors with strong prose rhythm tend to vary sentence
+     length deliberately, creating a sense of pacing.
+
+     Alternation score: average absolute difference in word count between
+     consecutive sentences, normalized by mean sentence length.
+
+     Sentence rhythm score: composite metric combining alternation with
+     sentence length variance (higher variance = more dynamic rhythm).
+
+     Reference:
+         Cutts, M. (2013). Oxford Guide to Plain English (4th ed.).
+         Recommends varying sentence length for readability.
+
+     Returns:
+         (sentence_length_alternation, sentence_rhythm_score)
      """
-     Compute rhythm and prosody metrics for written text.
+     sentences = _split_sentences(text)
+     if len(sentences) < 2:
+         return 0.0, 0.0
+
+     lengths = [len(_extract_words(s)) for s in sentences]
+     lengths = [length for length in lengths if length > 0]
+
+     if len(lengths) < 2:
+         return 0.0, 0.0
+
+     mean_len = sum(lengths) / len(lengths)
+     if mean_len == 0.0:
+         return 0.0, 0.0
+
+     # Alternation: mean absolute diff between consecutive sentences
+     diffs = [abs(lengths[i] - lengths[i - 1]) for i in range(1, len(lengths))]
+     alternation = (sum(diffs) / len(diffs)) / mean_len
+
+     # Rhythm score: combines alternation with normalized variance
+     variance = sum((length - mean_len) ** 2 for length in lengths) / len(lengths)
+     cv = math.sqrt(variance) / mean_len if mean_len > 0 else 0.0
+     rhythm_score = (alternation + cv) / 2.0
+
+     return alternation, rhythm_score
+
+
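A worked example of the two sentence-level scores:

    # Two sentences of 5 and 15 words: mean length = 10
    #   alternation  = |15 - 5| / 10 = 1.0
    #   variance = 25, CV = 5 / 10 = 0.5
    #   rhythm_score = (1.0 + 0.5) / 2 = 0.75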
+ # =============================================================================
+ # PHONOLOGICAL FEATURES: ALLITERATION, ASSONANCE, CONSONANCE
+ # =============================================================================
+
+
+ def _compute_alliteration(words: list[str]) -> float:
+     """Compute alliteration density (alliterative pairs per 100 words).
+
+     Alliteration is the repetition of initial consonant sounds in adjacent
+     or nearby words. This implementation checks consecutive word pairs for
+     matching initial consonant phonemes using the CMU dictionary.
+
+     Formula:
+         density = (alliterative_pairs / total_words) × 100
+
+     Reference:
+         Fabb, N., & Halle, M. (2008). Meter in Poetry. Cambridge University
+         Press. Chapter on phonological repetition in verse.
+
+     Returns:
+         Alliterative word pairs per 100 words.
+     """
+     if len(words) < 2:
+         return 0.0
+
+     pairs = 0
+     for i in range(len(words) - 1):
+         sound_a = _get_initial_sound(words[i])
+         sound_b = _get_initial_sound(words[i + 1])
+         if sound_a and sound_b and sound_a == sound_b:
+             pairs += 1
+
+     return (pairs / len(words)) * 100.0
+
+
+ def _compute_assonance(words: list[str]) -> float:
+     """Compute assonance density (assonant pairs per 100 words).
+
+     Assonance is the repetition of vowel sounds within nearby words,
+     regardless of surrounding consonants. This implementation checks
+     consecutive word pairs for shared vowel phonemes.
+
+     Formula:
+         density = (assonant_pairs / total_words) × 100
+
+     Returns:
+         Assonant word pairs per 100 words.
+     """
+     if len(words) < 2:
+         return 0.0
+
+     pairs = 0
+     for i in range(len(words) - 1):
+         vowels_a = set(_get_vowel_phonemes(words[i]))
+         vowels_b = set(_get_vowel_phonemes(words[i + 1]))
+         if vowels_a and vowels_b and vowels_a & vowels_b:
+             pairs += 1
+
+     return (pairs / len(words)) * 100.0
+
+
+ def _compute_consonance(words: list[str]) -> float:
+     """Compute consonance density (consonant-repeating pairs per 100 words).
+
+     Consonance is the repetition of consonant sounds within nearby words,
+     especially at the end of words. This implementation checks consecutive
+     word pairs for shared consonant phonemes.
+
+     Formula:
+         density = (consonant_pairs / total_words) × 100
+
+     Returns:
+         Consonant-repeating word pairs per 100 words.
+     """
+     if len(words) < 2:
+         return 0.0
+
+     pairs = 0
+     for i in range(len(words) - 1):
+         cons_a = set(_get_consonant_phonemes(words[i]))
+         cons_b = set(_get_consonant_phonemes(words[i + 1]))
+         if cons_a and cons_b and cons_a & cons_b:
+             pairs += 1
+
+     return (pairs / len(words)) * 100.0
+
+
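A short illustration of the alliteration count (assuming the usual CMU pronunciations, all beginning with /P/):

    # words = ["Peter", "Piper", "picked"]
    #   pairs (Peter, Piper) and (Piper, picked) share the initial sound -> 2 pairs
    #   alliteration density = (2 / 3) * 100 ≈ 66.7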
+ # =============================================================================
+ # CONSONANT CLUSTER METRICS
+ # =============================================================================
+
+
+ def _compute_cluster_metrics(
+     words: list[str],
+ ) -> tuple[float, float, float]:
+     """Compute consonant cluster complexity metrics.
+
+     Consonant clusters (two or more consonants in sequence) contribute to
+     the perceived complexity and rhythm of text. Languages and styles differ
+     in their tolerance for complex clusters.
+
+     Returns:
+         (mean_cluster_length, initial_cluster_ratio, final_cluster_ratio)
+
+     Where:
+         mean_cluster_length: average length of all consonant clusters found
+         initial_cluster_ratio: fraction of words starting with a cluster
+         final_cluster_ratio: fraction of words ending with a cluster
+     """
+     if not words:
+         return 0.0, 0.0, 0.0
+
+     cluster_lengths: list[int] = []
+     initial_count = 0
+     final_count = 0
+
+     for word in words:
+         w = word.lower()
+
+         initial_match = INITIAL_CLUSTER_PATTERN.match(w)
+         if initial_match:
+             initial_count += 1
+             cluster_lengths.append(len(initial_match.group()))
+
+         final_match = FINAL_CLUSTER_PATTERN.search(w)
+         if final_match:
+             final_count += 1
+             cluster_lengths.append(len(final_match.group()))
+
+     n = len(words)
+     mean_cluster = sum(cluster_lengths) / len(cluster_lengths) if cluster_lengths else 0.0
+     initial_ratio = initial_count / n
+     final_ratio = final_count / n
+
+     return mean_cluster, initial_ratio, final_ratio
+
+
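A concrete example of the cluster regexes at work:

    # "strengths": initial cluster "str" (length 3), final cluster "ngths" (length 5)
    _compute_cluster_metrics(["strengths"])   # (4.0, 1.0, 1.0)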
+ # =============================================================================
+ # METRICAL FOOT ESTIMATION
+ # =============================================================================
+
+
+ def _compute_metrical_feet(words: list[str]) -> tuple[float, float, float, float]:
+     """Estimate metrical foot ratios from word-level stress patterns.
+
+     Classical meter is defined by patterns of stressed (S) and unstressed (U)
+     syllables:
+         - Iamb: U-S (e.g., "above", "begin")
+         - Trochee: S-U (e.g., "garden", "happy")
+         - Dactyl: S-U-U (e.g., "merrily", "beautiful")
+         - Anapest: U-U-S (e.g., "understand", "intervene")
+
+     This function examines each word's stress pattern and classifies it as
+     matching one or more of these foot types. Multi-syllable words are
+     decomposed into overlapping bigrams/trigrams of stress values.
+
+     Reference:
+         Fabb, N., & Halle, M. (2008). Meter in Poetry. Cambridge University
+         Press.
+
+     Returns:
+         (iambic_ratio, trochaic_ratio, dactylic_ratio, anapestic_ratio)
+         Each as a fraction of total detected foot patterns.
+     """
+     iambic = 0
+     trochaic = 0
+     dactylic = 0
+     anapestic = 0
+     total = 0
+
+     for word in words:
+         stress = _get_stress_pattern(word)
+         if len(stress) < 2:
+             continue
+
+         # Binarize stress: 0 = unstressed, 1/2 = stressed
+         binary = [1 if s > 0 else 0 for s in stress]
+
+         # Check bigrams for iambic (0,1) and trochaic (1,0)
+         for i in range(len(binary) - 1):
+             pair = (binary[i], binary[i + 1])
+             if pair == (0, 1):
+                 iambic += 1
+                 total += 1
+             elif pair == (1, 0):
+                 trochaic += 1
+                 total += 1
+
+         # Check trigrams for dactylic (1,0,0) and anapestic (0,0,1)
+         for i in range(len(binary) - 2):
+             triple = (binary[i], binary[i + 1], binary[i + 2])
+             if triple == (1, 0, 0):
+                 dactylic += 1
+                 total += 1
+             elif triple == (0, 0, 1):
+                 anapestic += 1
+                 total += 1
+
+     if total == 0:
+         return 0.0, 0.0, 0.0, 0.0
+
+     return (
+         iambic / total,
+         trochaic / total,
+         dactylic / total,
+         anapestic / total,
+     )
+
+
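A worked example of the foot counting (stress values assume the usual CMU entry for "merrily"):

    # "merrily" -> stress [1, 0, 0] -> binary [1, 0, 0]
    #   bigrams: (1, 0) trochaic, (0, 0) no foot
    #   trigram: (1, 0, 0) dactylic
    _compute_metrical_feet(["merrily"])   # ≈ (0.0, 0.5, 0.5, 0.0)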
+ # =============================================================================
+ # MAIN ENTRY POINT
+ # =============================================================================
+
+
+ def compute_rhythm_prosody(text: str) -> RhythmProsodyResult:
+     """Compute rhythm and prosody metrics for written text.
+
+     Analyzes the musical qualities of written language through syllable
+     patterns, stress patterns, phonological features, and metrical foot
+     estimation. These metrics are particularly relevant for poetry analysis
+     and literary stylometry, but also capture prose rhythm patterns.

      Related GitHub Issue:
          #25 - Rhythm and Prosody Metrics
          https://github.com/craigtrim/pystylometry/issues/25

+     Metrics computed:
+
+     Syllable patterns:
+         - mean_syllables_per_word: Average syllable count across all words.
+         - syllable_std_dev: Standard deviation of syllable counts.
+         - polysyllabic_ratio: Fraction of words with 3+ syllables.
+         - monosyllabic_ratio: Fraction of single-syllable words.
+
+     Rhythmic regularity:
+         - rhythmic_regularity: 1 / CV of syllable counts (higher = more regular).
+         - syllable_cv: Coefficient of variation of syllable counts.
+         - stress_pattern_entropy: Shannon entropy of stress patterns in bits.
+
+     Sentence rhythm:
+         - sentence_length_alternation: Normalized mean absolute difference
+           between consecutive sentence lengths.
+         - sentence_rhythm_score: Composite metric combining alternation
+           and sentence length variance.
+
+     Phonological features:
+         - alliteration_density: Alliterative word pairs per 100 words.
+         - assonance_density: Assonant word pairs per 100 words.
+         - consonance_density: Consonant-repeating pairs per 100 words.
+
+     Syllable complexity:
+         - mean_consonant_cluster_length: Average length of consonant clusters.
+         - initial_cluster_ratio: Words starting with consonant clusters.
+         - final_cluster_ratio: Words ending with consonant clusters.
+
+     Stress patterns (metrical feet):
+         - iambic_ratio: Unstressed-stressed pairs / total feet.
+         - trochaic_ratio: Stressed-unstressed pairs / total feet.
+         - dactylic_ratio: Stressed-unstressed-unstressed trigrams / total feet.
+         - anapestic_ratio: Unstressed-unstressed-stressed trigrams / total feet.
+
+     Dependencies:
+         Requires the ``pronouncing`` package for CMU dictionary access.
+         Install with: ``pip install pystylometry[readability]``
+
+     References:
+         Fabb, N., & Halle, M. (2008). Meter in Poetry: A New Theory.
+         Cambridge University Press.
+         Greene, E., Bodrumlu, T., & Knight, K. (2010). Automatic analysis of
+         rhythmic poetry with applications to generation and translation.
+         Proceedings of EMNLP, 524-533.
+         Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm
+         and text comprehension. Memory & Cognition, 33(3), 388-396.
+
      Args:
-         text: Input text to analyze
+         text: Input text to analyze. For reliable prosodic statistics, at
+             least 100+ words are recommended. Shorter texts will produce
+             valid but potentially unstable metrics.

      Returns:
          RhythmProsodyResult with syllable patterns, rhythmic regularity,
          phonological features, stress patterns, and complexity metrics.
+         See ``_types.RhythmProsodyResult`` for complete field documentation.

      Example:
-         >>> result = compute_rhythm_prosody("Sample text with rhythm...")
+         >>> result = compute_rhythm_prosody("The quick brown fox jumps over the lazy dog.")
          >>> print(f"Syllables/word: {result.mean_syllables_per_word:.2f}")
          >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
          >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
      """
-     # TODO: Implement rhythm and prosody analysis
-     # GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25
-     raise NotImplementedError(
-         "Rhythm and prosody metrics not yet implemented. "
-         "See GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25"
+     # Handle empty text
+     if not text or not text.strip():
+         return RhythmProsodyResult(
+             mean_syllables_per_word=0.0,
+             syllable_std_dev=0.0,
+             polysyllabic_ratio=0.0,
+             monosyllabic_ratio=0.0,
+             rhythmic_regularity=0.0,
+             syllable_cv=0.0,
+             stress_pattern_entropy=0.0,
+             sentence_length_alternation=0.0,
+             sentence_rhythm_score=0.0,
+             alliteration_density=0.0,
+             assonance_density=0.0,
+             consonance_density=0.0,
+             mean_consonant_cluster_length=0.0,
+             initial_cluster_ratio=0.0,
+             final_cluster_ratio=0.0,
+             iambic_ratio=0.0,
+             trochaic_ratio=0.0,
+             dactylic_ratio=0.0,
+             anapestic_ratio=0.0,
+             metadata={"word_count": 0, "warning": "Empty text"},
+         )
+
+     words = _extract_words(text)
+     if not words:
+         return RhythmProsodyResult(
+             mean_syllables_per_word=0.0,
+             syllable_std_dev=0.0,
+             polysyllabic_ratio=0.0,
+             monosyllabic_ratio=0.0,
+             rhythmic_regularity=0.0,
+             syllable_cv=0.0,
+             stress_pattern_entropy=0.0,
+             sentence_length_alternation=0.0,
+             sentence_rhythm_score=0.0,
+             alliteration_density=0.0,
+             assonance_density=0.0,
+             consonance_density=0.0,
+             mean_consonant_cluster_length=0.0,
+             initial_cluster_ratio=0.0,
+             final_cluster_ratio=0.0,
+             iambic_ratio=0.0,
+             trochaic_ratio=0.0,
+             dactylic_ratio=0.0,
+             anapestic_ratio=0.0,
+             metadata={"word_count": 0, "warning": "No words found"},
+         )
+
+     # =========================================================================
+     # SYLLABLE PATTERNS
+     # =========================================================================
+     mean_syl, syl_std, poly_ratio, mono_ratio, syl_counts = _compute_syllable_metrics(words)
+
+     # =========================================================================
+     # RHYTHMIC REGULARITY
+     # =========================================================================
+     regularity, cv = _compute_rhythmic_regularity(syl_counts)
+
+     # =========================================================================
+     # STRESS PATTERN ENTROPY
+     # =========================================================================
+     stress_entropy = _compute_stress_entropy(words)
+
+     # =========================================================================
+     # SENTENCE RHYTHM
+     # =========================================================================
+     alternation, rhythm_score = _compute_sentence_rhythm(text)
+
+     # =========================================================================
+     # PHONOLOGICAL FEATURES
+     # =========================================================================
+     alliteration = _compute_alliteration(words)
+     assonance = _compute_assonance(words)
+     consonance = _compute_consonance(words)
+
+     # =========================================================================
+     # CONSONANT CLUSTER COMPLEXITY
+     # =========================================================================
+     mean_cluster, initial_ratio, final_ratio = _compute_cluster_metrics(words)
+
+     # =========================================================================
+     # METRICAL FOOT ESTIMATION
+     # =========================================================================
+     iambic, trochaic, dactylic, anapestic = _compute_metrical_feet(words)
+
+     # =========================================================================
+     # METADATA
+     # =========================================================================
+     # Collect per-word stress patterns for downstream analysis
+     word_stress_patterns: dict[str, list[int]] = {}
+     for word in set(words):
+         stress = _get_stress_pattern(word)
+         if stress:
+             word_stress_patterns[word.lower()] = stress
+
+     cmu_coverage = len(word_stress_patterns) / len(set(words)) if words else 0.0
+
+     metadata: dict[str, Any] = {
+         "word_count": len(words),
+         "unique_words": len(set(w.lower() for w in words)),
+         "sentence_count": len(_split_sentences(text)),
+         "total_syllables": sum(syl_counts),
+         "cmu_coverage": cmu_coverage,
+         "syllable_distribution": dict(Counter(syl_counts)),
+         "word_stress_patterns": word_stress_patterns,
+     }
+
+     return RhythmProsodyResult(
+         mean_syllables_per_word=mean_syl,
+         syllable_std_dev=syl_std,
+         polysyllabic_ratio=poly_ratio,
+         monosyllabic_ratio=mono_ratio,
+         rhythmic_regularity=regularity,
+         syllable_cv=cv,
+         stress_pattern_entropy=stress_entropy,
+         sentence_length_alternation=alternation,
+         sentence_rhythm_score=rhythm_score,
+         alliteration_density=alliteration,
+         assonance_density=assonance,
+         consonance_density=consonance,
+         mean_consonant_cluster_length=mean_cluster,
+         initial_cluster_ratio=initial_ratio,
+         final_cluster_ratio=final_ratio,
+         iambic_ratio=iambic,
+         trochaic_ratio=trochaic,
+         dactylic_ratio=dactylic,
+         anapestic_ratio=anapestic,
+         metadata=metadata,
      )
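To make the new entry point concrete, a minimal usage sketch; the import path below is hypothetical (the function lives in the module shown in this diff, and the package's public re-export may differ):

    from pystylometry import compute_rhythm_prosody  # hypothetical import path

    result = compute_rhythm_prosody(
        "The quick brown fox jumps over the lazy dog. It barked. The fox ran far away."
    )
    print(f"Syllables/word: {result.mean_syllables_per_word:.2f}")
    print(f"Iambic ratio:   {result.iambic_ratio:.2f}")
    print(result.metadata["cmu_coverage"])   # fraction of unique words found in the CMU dictionary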