pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/lexical/yule.py
@@ -15,7 +15,7 @@ def compute_yule(text: str) -> YuleResult:
 
     Formula:
         K = 10⁴ × (Σm²×Vm - N) / N²
-        I = (V² / Σm²×Vm) - (1/N)
+        I = V² / (Σm²×Vm - N)
 
     Where:
         - N = total tokens
@@ -33,16 +33,29 @@ def compute_yule(text: str) -> YuleResult:
     Returns:
         YuleResult with .yule_k, .yule_i, and metadata
 
+    Note: For empty input or when Σm²×Vm = N (perfectly uniform vocabulary),
+        metrics will be float('nan') to indicate undefined values.
+
     Example:
         >>> result = compute_yule("The quick brown fox jumps over the lazy dog.")
         >>> print(f"Yule's K: {result.yule_k:.2f}")
         >>> print(f"Yule's I: {result.yule_i:.2f}")
+
+        >>> # Empty input returns NaN
+        >>> import math
+        >>> result_empty = compute_yule("")
+        >>> math.isnan(result_empty.yule_k)
+        True
     """
     tokens = tokenize(text.lower())
     N = len(tokens)  # noqa: N806
 
     if N == 0:
-        return YuleResult(yule_k=0.0, yule_i=0.0, metadata={"token_count": 0, "vocabulary_size": 0})
+        return YuleResult(
+            yule_k=float("nan"),
+            yule_i=float("nan"),
+            metadata={"token_count": 0, "vocabulary_size": 0},
+        )
 
     # Count frequency of each token
     freq_counter = Counter(tokens)
@@ -50,11 +63,25 @@ def compute_yule(text: str) -> YuleResult:
 
     # Count how many words occur with each frequency
     # Vm[m] = number of words that occur exactly m times
-    # freq_of_freqs = Counter(freq_counter.values())  # TODO: Will be needed for Yule's K
-
-    # TODO: Implement Yule's K and I calculations
-    yule_k = 0.0  # Placeholder
-    yule_i = 0.0  # Placeholder
+    freq_of_freqs = Counter(freq_counter.values())
+
+    # Calculate Σm²×Vm (sum of m-squared times Vm for all m)
+    # This is the sum across all frequency levels of:
+    #   (frequency)² × (count of words at that frequency)
+    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())
+
+    # Yule's K: 10⁴ × (Σm²×Vm - N) / N²
+    # K measures vocabulary repetitiveness (higher K = more repetitive)
+    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
+
+    # Yule's I: V² / (Σm²×Vm - N)
+    # I is the inverse measure (higher I = more diverse)
+    # If Σm²×Vm = N (perfectly uniform vocabulary), denominator is 0, return NaN
+    denominator = sum_m2_vm - N
+    if denominator == 0:
+        yule_i = float("nan")
+    else:
+        yule_i = (V * V) / denominator
 
     return YuleResult(
         yule_k=yule_k,
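
To sanity-check the corrected formulas, here is a minimal standalone sketch of the same arithmetic in plain Python. The hand-built token list is hypothetical and the snippet bypasses pystylometry's own tokenize(); it only illustrates how K and I share the Σm²×Vm term.

    from collections import Counter

    # Hypothetical tokens; pystylometry's tokenize() may split differently.
    tokens = "the quick brown fox jumps over the lazy dog the".split()
    N = len(tokens)               # 10 tokens
    freqs = Counter(tokens)       # 'the' occurs 3x, 7 words occur once
    V = len(freqs)                # 8 distinct words
    vm = Counter(freqs.values())  # {1: 7, 3: 1}
    sum_m2_vm = sum(m * m * c for m, c in vm.items())  # 1*1*7 + 3*3*1 = 16

    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)  # 10000 * 6 / 100 = 600.0
    yule_i = (V * V) / (sum_m2_vm - N)           # 64 / 6 ≈ 10.67
    print(yule_k, yule_i)

Under the old formula, (V² / Σm²×Vm) - (1/N) = 64/16 - 1/10 = 3.9, a very different value; the fix therefore changes reported Yule's I across versions.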
pystylometry/ngrams/__init__.py
@@ -5,9 +5,11 @@ from .entropy import (
     compute_ngram_entropy,
     compute_word_bigram_entropy,
 )
+from .extended_ngrams import compute_extended_ngrams
 
 __all__ = [
     "compute_ngram_entropy",
     "compute_character_bigram_entropy",
     "compute_word_bigram_entropy",
+    "compute_extended_ngrams",
 ]
pystylometry/ngrams/extended_ngrams.py
@@ -0,0 +1,235 @@
+"""Extended n-gram features for authorship attribution.
+
+This module provides comprehensive n-gram analysis beyond basic bigram/trigram
+entropy. Features include frequency distributions for higher-order n-grams,
+skipgrams (n-grams with gaps), and POS n-grams, all valuable for stylometric
+analysis and authorship attribution.
+
+Related GitHub Issue:
+    #19 - Extended N-gram Features
+    https://github.com/craigtrim/pystylometry/issues/19
+
+Features implemented:
+    - Word trigrams and 4-grams (frequency distributions, top n-grams)
+    - Skipgrams (n-grams with gaps, e.g., "the * dog")
+    - POS n-grams (part-of-speech tag sequences)
+    - Character trigrams and 4-grams
+    - N-gram diversity metrics
+    - Entropy calculations for each n-gram order
+
+References:
+    Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
+        A closer look at skip-gram modelling. LREC.
+    Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+        JASIST, 60(3), 538-556.
+    Kešelj, V., et al. (2003). N-gram-based author profiles for authorship
+        attribution. PACLING.
+"""
+
+from .._types import ExtendedNgramResult
+
+
+def compute_extended_ngrams(
+    text: str,
+    top_n: int = 20,
+    include_pos_ngrams: bool = True,
+    spacy_model: str = "en_core_web_sm",
+) -> ExtendedNgramResult:
+    """
+    Compute extended n-gram features for stylometric analysis.
+
+    Analyzes text to extract comprehensive n-gram statistics including
+    word trigrams/4-grams, skipgrams, POS n-grams, and character n-grams.
+    These features are powerful for authorship attribution because they
+    capture both lexical and syntactic patterns.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Why extended n-grams matter:
+
+    Word N-grams:
+        - Capture phrasal patterns and collocations
+        - Trigrams/4-grams more distinctive than bigrams
+        - Reveal preferred multi-word expressions
+        - Author-specific phrase preferences
+
+    Skipgrams:
+        - N-grams with gaps (e.g., "I * to" matches "I want to", "I have to")
+        - Capture syntactic frames independent of specific words
+        - Less sparse than contiguous n-grams
+        - Model long-distance dependencies
+
+    POS N-grams:
+        - Abstract syntactic patterns (e.g., "DET ADJ NOUN")
+        - Independent of vocabulary
+        - Capture grammatical preferences
+        - Complement word n-grams
+
+    Character N-grams:
+        - Language-independent features
+        - Capture morphological patterns
+        - Effective for short texts
+        - Robust to OCR errors
+
+    N-gram Types:
+
+    Contiguous Word N-grams:
+        - Trigrams: sequences of 3 words ("in the world")
+        - 4-grams: sequences of 4 words ("at the end of")
+
+    Skipgrams:
+        - 2-skipgrams with gap 1: "word1 _ word3"
+        - 3-skipgrams with gap 1: "word1 _ word3 word4"
+        - Variable gap sizes possible
+
+    POS N-grams:
+        - POS trigrams: "DET ADJ NOUN" (the quick fox)
+        - POS 4-grams: "VERB DET ADJ NOUN" (saw the quick fox)
+
+    Character N-grams:
+        - Character trigrams: "the", "he ", "e w"
+        - Character 4-grams: "the ", "he w", "e wo"
+
+    Args:
+        text: Input text to analyze. Should contain at least 100+ words for
+            meaningful n-gram statistics. Shorter texts will have sparse
+            distributions.
+        top_n: Number of most frequent n-grams to return for each type.
+            Default is 20. Larger values provide more detail but increase
+            result size.
+        include_pos_ngrams: Whether to compute POS n-grams. Requires spaCy
+            and is slower. Default is True. Set to False for
+            faster computation without syntactic features.
+        spacy_model: spaCy model for POS tagging (if include_pos_ngrams=True).
+            Default is "en_core_web_sm".
+
+    Returns:
+        ExtendedNgramResult containing:
+
+        Word n-grams:
+            - top_word_trigrams: Most frequent word trigrams with counts
+            - top_word_4grams: Most frequent word 4-grams with counts
+            - word_trigram_count: Total unique word trigrams
+            - word_4gram_count: Total unique word 4-grams
+            - word_trigram_entropy: Shannon entropy of trigram distribution
+            - word_4gram_entropy: Shannon entropy of 4-gram distribution
+
+        Skipgrams:
+            - top_skipgrams_2_1: Top 2-skipgrams with gap of 1
+            - top_skipgrams_3_1: Top 3-skipgrams with gap of 1
+            - skipgram_2_1_count: Unique 2-skipgrams
+            - skipgram_3_1_count: Unique 3-skipgrams
+
+        POS n-grams (if include_pos_ngrams=True):
+            - top_pos_trigrams: Most frequent POS trigrams with counts
+            - top_pos_4grams: Most frequent POS 4-grams with counts
+            - pos_trigram_count: Unique POS trigrams
+            - pos_4gram_count: Unique POS 4-grams
+            - pos_trigram_entropy: Shannon entropy of POS trigram distribution
+
+        Character n-grams:
+            - top_char_trigrams: Most frequent character trigrams with counts
+            - top_char_4grams: Most frequent character 4-grams with counts
+            - char_trigram_entropy: Shannon entropy of char trigram distribution
+            - char_4gram_entropy: Shannon entropy of char 4-gram distribution
+
+        Metadata:
+            - Full frequency distributions
+            - Parameters used
+            - Token counts
+            - etc.
+
+    Example:
+        >>> result = compute_extended_ngrams("Sample text for analysis...")
+        >>> print(f"Top word trigrams: {result.top_word_trigrams[:3]}")
+        Top word trigrams: [('in the world', 5), ('of the world', 4), ('at the time', 3)]
+        >>> print(f"Word trigram entropy: {result.word_trigram_entropy:.2f}")
+        Word trigram entropy: 4.32
+        >>> print(f"Top POS trigrams: {result.top_pos_trigrams[:3]}")
+        Top POS trigrams: [('DET ADJ NOUN', 12), ('VERB DET NOUN', 8), ('DET NOUN VERB', 6)]
+
+        >>> # Compare authors using n-grams
+        >>> author1 = compute_extended_ngrams("Text by author 1...")
+        >>> author2 = compute_extended_ngrams("Text by author 2...")
+        >>> # Compare top_word_trigrams for distinctive phrases
+
+    Note:
+        - Memory usage scales with text length and n-gram order
+        - Longer texts have more unique n-grams (higher counts)
+        - POS n-grams require spaCy (slower but valuable)
+        - Character n-grams include whitespace
+        - Skipgrams can be very sparse (many unique patterns)
+        - Entropy values higher for more diverse n-gram distributions
+    """
+    # TODO: Implement extended n-gram analysis
+    # GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19
+    #
+    # Implementation steps:
+    #
+    # Word N-grams:
+    # 1. Tokenize text into words (lowercase, basic cleaning)
+    # 2. Generate word trigrams:
+    #    - Slide window of size 3 across word list
+    #    - Create tuples of 3 consecutive words
+    #    - Count frequency of each trigram
+    # 3. Generate word 4-grams (similar, window size 4)
+    # 4. Sort by frequency, extract top_n for each
+    # 5. Calculate Shannon entropy for each distribution:
+    #    H = -sum(p * log2(p)) where p = freq / total
+    #
+    # Skipgrams:
+    # 6. Generate 2-skipgrams with gap 1:
+    #    - For each position i: (word[i], word[i+2])
+    #    - Skips middle word
+    #    - Count frequencies
+    # 7. Generate 3-skipgrams with gap 1:
+    #    - For each position i: (word[i], word[i+2], word[i+3])
+    #    - Pattern: word, skip, word, word
+    #    - Count frequencies
+    # 8. Sort and extract top_n skipgrams
+    #
+    # POS N-grams (if include_pos_ngrams):
+    # 9. Load spaCy model for POS tagging
+    # 10. Parse text to get POS tags for each word
+    # 11. Generate POS trigrams (same as word trigrams, but use POS tags)
+    # 12. Generate POS 4-grams
+    # 13. Count frequencies, extract top_n
+    # 14. Calculate Shannon entropy
+    #
+    # Character N-grams:
+    # 15. Generate character trigrams:
+    #     - Slide window of size 3 across character sequence
+    #     - Include spaces and punctuation
+    #     - Count frequencies
+    # 16. Generate character 4-grams (window size 4)
+    # 17. Sort and extract top_n for each
+    # 18. Calculate Shannon entropy
+    #
+    # Diversity Metrics:
+    # 19. Count total unique n-grams for each type
+    # 20. Calculate type-token ratios (unique / total)
+    #
+    # Metadata:
+    # 21. Store full frequency distributions (optional, can be large)
+    # 22. Store parameters: top_n, include_pos_ngrams, model
+    # 23. Store token/character counts
+    #
+    # Helper Functions Needed:
+    # - generate_ngrams(sequence, n) -> list[tuple]
+    # - generate_skipgrams(sequence, n, gap) -> list[tuple]
+    # - calculate_shannon_entropy(freq_dist) -> float
+    # - get_top_n(freq_dist, n) -> list[tuple]
+    #
+    # Return ExtendedNgramResult
+    #
+    # Optimization notes:
+    # - Use Counter from collections for frequency counting
+    # - Consider sampling for very long texts
+    # - Limit maximum n-gram types to prevent memory issues
+    # - POS tagging is slowest step - make it optional
+    raise NotImplementedError(
+        "Extended n-gram features not yet implemented. "
+        "See GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19"
+    )
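
The TODO block above names four helpers. A minimal sketch of what they might look like when issue #19 lands, using only the signatures the comment itself proposes; these are assumed implementations, not the package's actual code.

    from collections import Counter
    from math import log2


    def generate_ngrams(sequence: list, n: int) -> list[tuple]:
        # Contiguous n-grams: slide a window of size n (TODO steps 2-3, 15-16).
        return [tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]


    def generate_skipgrams(sequence: list, n: int, gap: int) -> list[tuple]:
        # Skipgrams per the TODO's patterns: first token, then `gap` skipped
        # tokens, then the remaining n-1 contiguous tokens. For n=2, gap=1 this
        # yields (word[i], word[i+2]); for n=3, gap=1 (word[i], word[i+2], word[i+3]).
        width = n + gap
        return [
            (sequence[i], *sequence[i + gap + 1 : i + width])
            for i in range(len(sequence) - width + 1)
        ]


    def calculate_shannon_entropy(freq_dist: Counter) -> float:
        # H = -sum(p * log2(p)) with p = freq / total (TODO step 5).
        total = sum(freq_dist.values())
        if total == 0:
            return float("nan")
        return -sum((c / total) * log2(c / total) for c in freq_dist.values())


    def get_top_n(freq_dist: Counter, n: int) -> list[tuple]:
        # Most frequent n-grams with counts (e.g., for top_word_trigrams).
        return freq_dist.most_common(n)


    words = "the cat sat on the mat and the cat sat down".split()
    trigrams = Counter(generate_ngrams(words, 3))
    skip21 = Counter(generate_skipgrams(words, 2, 1))
    print(get_top_n(trigrams, 3))             # ('the', 'cat', 'sat') appears twice
    print(calculate_shannon_entropy(trigrams))
    print(get_top_n(skip21, 3))

Character n-grams would use the same generate_ngrams over the raw string instead of a word list, since slicing works identically on strings.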
pystylometry/prosody/__init__.py
@@ -0,0 +1,12 @@
+"""Rhythm and prosody metrics for written text.
+
+Related GitHub Issue:
+    #25 - Rhythm and Prosody Metrics
+    https://github.com/craigtrim/pystylometry/issues/25
+"""
+
+from .rhythm_prosody import compute_rhythm_prosody
+
+__all__ = [
+    "compute_rhythm_prosody",
+]
pystylometry/prosody/rhythm_prosody.py
@@ -0,0 +1,53 @@
+"""Rhythm and prosody metrics for written text.
+
+This module captures the musical qualities of written language, including
+stress patterns, syllable rhythms, and phonological features. While traditionally
+studied in spoken language, written text preserves many rhythmic patterns.
+
+Related GitHub Issue:
+    #25 - Rhythm and Prosody Metrics
+    https://github.com/craigtrim/pystylometry/issues/25
+
+Features analyzed:
+    - Syllable patterns and stress patterns
+    - Rhythmic regularity (coefficient of variation)
+    - Phonological features (alliteration, assonance)
+    - Syllable complexity (consonant clusters)
+    - Sentence rhythm
+    - Polysyllabic word usage
+
+References:
+    Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
+        text comprehension. Memory & Cognition, 33(3), 388-396.
+"""
+
+from .._types import RhythmProsodyResult
+
+
+def compute_rhythm_prosody(text: str) -> RhythmProsodyResult:
+    """
+    Compute rhythm and prosody metrics for written text.
+
+    Related GitHub Issue:
+        #25 - Rhythm and Prosody Metrics
+        https://github.com/craigtrim/pystylometry/issues/25
+
+    Args:
+        text: Input text to analyze
+
+    Returns:
+        RhythmProsodyResult with syllable patterns, rhythmic regularity,
+        phonological features, stress patterns, and complexity metrics.
+
+    Example:
+        >>> result = compute_rhythm_prosody("Sample text with rhythm...")
+        >>> print(f"Syllables/word: {result.mean_syllables_per_word:.2f}")
+        >>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
+        >>> print(f"Alliteration density: {result.alliteration_density:.2f}")
+    """
+    # TODO: Implement rhythm and prosody analysis
+    # GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25
+    raise NotImplementedError(
+        "Rhythm and prosody metrics not yet implemented. "
+        "See GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25"
+    )
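
The stub's docstring lists "rhythmic regularity (coefficient of variation)" among the planned features. A rough standalone illustration of that idea follows; the naive vowel-group syllable counter and the 1/(1+CV) scoring are assumptions of this sketch, not what issue #25 will actually ship.

    import re
    from statistics import mean, pstdev


    def naive_syllable_count(word: str) -> int:
        # Rough heuristic: one syllable per vowel group. A real implementation
        # would reuse the package's readability syllable logic instead.
        return max(1, len(re.findall(r"[aeiouy]+", word.lower())))


    def rhythmic_regularity(text: str) -> float:
        # Coefficient of variation (CV = stdev / mean) of per-word syllable
        # counts, mapped to (0, 1] via 1 / (1 + CV): values near 1.0 indicate
        # a steady rhythm. The mapping is this sketch's assumption.
        words = re.findall(r"[A-Za-z']+", text)
        counts = [naive_syllable_count(w) for w in words]
        if len(counts) < 2:
            return float("nan")
        cv = pstdev(counts) / mean(counts)
        return 1.0 / (1.0 + cv)


    print(rhythmic_regularity("The cat sat on the mat."))                 # 1.0: all monosyllables
    print(rhythmic_regularity("Extraordinary circumstances demand it."))  # lower: mixed word lengths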
pystylometry/readability/__init__.py
@@ -1,5 +1,12 @@
 """Readability metrics."""
 
+from .additional_formulas import (
+    compute_dale_chall,
+    compute_forcast,
+    compute_fry,
+    compute_linsear_write,
+    compute_powers_sumner_kearl,
+)
 from .ari import compute_ari
 from .coleman_liau import compute_coleman_liau
 from .flesch import compute_flesch
@@ -12,4 +19,9 @@ __all__ = [
     "compute_gunning_fog",
     "compute_coleman_liau",
     "compute_ari",
+    "compute_dale_chall",
+    "compute_linsear_write",
+    "compute_fry",
+    "compute_forcast",
+    "compute_powers_sumner_kearl",
 ]
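
With these exports in place, the new formulas import alongside the existing ones. A usage sketch, assuming the new functions follow the compute_<name>(text) convention seen elsewhere in the package; their actual signatures are not shown in this diff.

    from pystylometry.readability import compute_dale_chall, compute_flesch

    text = "The quick brown fox jumps over the lazy dog."
    print(compute_flesch(text))      # existing in 0.1.0
    print(compute_dale_chall(text))  # new in 1.0.0; signature assumed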