pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""Extended n-gram features for authorship attribution.
|
|
2
|
+
|
|
3
|
+
This module provides comprehensive n-gram analysis beyond basic bigram/trigram
|
|
4
|
+
entropy. Features include frequency distributions for higher-order n-grams,
|
|
5
|
+
skipgrams (n-grams with gaps), and POS n-grams, all valuable for stylometric
|
|
6
|
+
analysis and authorship attribution.
|
|
7
|
+
|
|
8
|
+
Related GitHub Issue:
|
|
9
|
+
#19 - Extended N-gram Features
|
|
10
|
+
https://github.com/craigtrim/pystylometry/issues/19
|
|
11
|
+
|
|
12
|
+
Features implemented:
|
|
13
|
+
- Word trigrams and 4-grams (frequency distributions, top n-grams)
|
|
14
|
+
- Skipgrams (n-grams with gaps, e.g., "the * dog")
|
|
15
|
+
- POS n-grams (part-of-speech tag sequences)
|
|
16
|
+
- Character trigrams and 4-grams
|
|
17
|
+
- N-gram diversity metrics
|
|
18
|
+
- Entropy calculations for each n-gram order
|
|
19
|
+
|
|
20
|
+
References:
|
|
21
|
+
Guthrie, D., Allison, B., Liu, W., Guthrie, L., & Wilks, Y. (2006).
|
|
22
|
+
A closer look at skip-gram modelling. LREC.
|
|
23
|
+
Stamatatos, E. (2009). A survey of modern authorship attribution methods.
|
|
24
|
+
JASIST, 60(3), 538-556.
|
|
25
|
+
Kešelj, V., et al. (2003). N-gram-based author profiles for authorship
|
|
26
|
+
attribution. PACLING.
|
|
27
|
+
"""
|
|
28
|
+
|
|
29
|
+
from .._types import ExtendedNgramResult
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def compute_extended_ngrams(
|
|
33
|
+
text: str,
|
|
34
|
+
top_n: int = 20,
|
|
35
|
+
include_pos_ngrams: bool = True,
|
|
36
|
+
spacy_model: str = "en_core_web_sm",
|
|
37
|
+
) -> ExtendedNgramResult:
|
|
38
|
+
"""
|
|
39
|
+
Compute extended n-gram features for stylometric analysis.
|
|
40
|
+
|
|
41
|
+
Analyzes text to extract comprehensive n-gram statistics including
|
|
42
|
+
word trigrams/4-grams, skipgrams, POS n-grams, and character n-grams.
|
|
43
|
+
These features are powerful for authorship attribution because they
|
|
44
|
+
capture both lexical and syntactic patterns.
|
|
45
|
+
|
|
46
|
+
Related GitHub Issue:
|
|
47
|
+
#19 - Extended N-gram Features
|
|
48
|
+
https://github.com/craigtrim/pystylometry/issues/19
|
|
49
|
+
|
|
50
|
+
Why extended n-grams matter:
|
|
51
|
+
|
|
52
|
+
Word N-grams:
|
|
53
|
+
- Capture phrasal patterns and collocations
|
|
54
|
+
- Trigrams/4-grams more distinctive than bigrams
|
|
55
|
+
- Reveal preferred multi-word expressions
|
|
56
|
+
- Author-specific phrase preferences
|
|
57
|
+
|
|
58
|
+
Skipgrams:
|
|
59
|
+
- N-grams with gaps (e.g., "I * to" matches "I want to", "I have to")
|
|
60
|
+
- Capture syntactic frames independent of specific words
|
|
61
|
+
- Less sparse than contiguous n-grams
|
|
62
|
+
- Model long-distance dependencies
|
|
63
|
+
|
|
64
|
+
POS N-grams:
|
|
65
|
+
- Abstract syntactic patterns (e.g., "DET ADJ NOUN")
|
|
66
|
+
- Independent of vocabulary
|
|
67
|
+
- Capture grammatical preferences
|
|
68
|
+
- Complement word n-grams
|
|
69
|
+
|
|
70
|
+
Character N-grams:
|
|
71
|
+
- Language-independent features
|
|
72
|
+
- Capture morphological patterns
|
|
73
|
+
- Effective for short texts
|
|
74
|
+
- Robust to OCR errors
|
|
75
|
+
|
|
76
|
+
N-gram Types:
|
|
77
|
+
|
|
78
|
+
Contiguous Word N-grams:
|
|
79
|
+
- Trigrams: sequences of 3 words ("in the world")
|
|
80
|
+
- 4-grams: sequences of 4 words ("at the end of")
|
|
81
|
+
|
|
82
|
+
Skipgrams:
|
|
83
|
+
- 2-skipgrams with gap 1: "word1 _ word3"
|
|
84
|
+
- 3-skipgrams with gap 1: "word1 _ word3 word4"
|
|
85
|
+
- Variable gap sizes possible
|
|
86
|
+
|
|
87
|
+
POS N-grams:
|
|
88
|
+
- POS trigrams: "DET ADJ NOUN" (the quick fox)
|
|
89
|
+
- POS 4-grams: "VERB DET ADJ NOUN" (saw the quick fox)
|
|
90
|
+
|
|
91
|
+
Character N-grams:
|
|
92
|
+
- Character trigrams: "the", "he ", "e w"
|
|
93
|
+
- Character 4-grams: "the ", "he w", "e wo"
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
text: Input text to analyze. Should contain at least 100 words for
|
|
97
|
+
meaningful n-gram statistics. Shorter texts will have sparse
|
|
98
|
+
distributions.
|
|
99
|
+
top_n: Number of most frequent n-grams to return for each type.
|
|
100
|
+
Default is 20. Larger values provide more detail but increase
|
|
101
|
+
result size.
|
|
102
|
+
include_pos_ngrams: Whether to compute POS n-grams. Requires spaCy
|
|
103
|
+
and is slower. Default is True. Set to False for
|
|
104
|
+
faster computation without syntactic features.
|
|
105
|
+
spacy_model: spaCy model for POS tagging (if include_pos_ngrams=True).
|
|
106
|
+
Default is "en_core_web_sm".
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
ExtendedNgramResult containing:
|
|
110
|
+
|
|
111
|
+
Word n-grams:
|
|
112
|
+
- top_word_trigrams: Most frequent word trigrams with counts
|
|
113
|
+
- top_word_4grams: Most frequent word 4-grams with counts
|
|
114
|
+
- word_trigram_count: Total unique word trigrams
|
|
115
|
+
- word_4gram_count: Total unique word 4-grams
|
|
116
|
+
- word_trigram_entropy: Shannon entropy of trigram distribution
|
|
117
|
+
- word_4gram_entropy: Shannon entropy of 4-gram distribution
|
|
118
|
+
|
|
119
|
+
Skipgrams:
|
|
120
|
+
- top_skipgrams_2_1: Top 2-skipgrams with gap of 1
|
|
121
|
+
- top_skipgrams_3_1: Top 3-skipgrams with gap of 1
|
|
122
|
+
- skipgram_2_1_count: Unique 2-skipgrams
|
|
123
|
+
- skipgram_3_1_count: Unique 3-skipgrams
|
|
124
|
+
|
|
125
|
+
POS n-grams (if include_pos_ngrams=True):
|
|
126
|
+
- top_pos_trigrams: Most frequent POS trigrams with counts
|
|
127
|
+
- top_pos_4grams: Most frequent POS 4-grams with counts
|
|
128
|
+
- pos_trigram_count: Unique POS trigrams
|
|
129
|
+
- pos_4gram_count: Unique POS 4-grams
|
|
130
|
+
- pos_trigram_entropy: Shannon entropy of POS trigram distribution
|
|
131
|
+
|
|
132
|
+
Character n-grams:
|
|
133
|
+
- top_char_trigrams: Most frequent character trigrams with counts
|
|
134
|
+
- top_char_4grams: Most frequent character 4-grams with counts
|
|
135
|
+
- char_trigram_entropy: Shannon entropy of char trigram distribution
|
|
136
|
+
- char_4gram_entropy: Shannon entropy of char 4-gram distribution
|
|
137
|
+
|
|
138
|
+
Metadata:
|
|
139
|
+
- Full frequency distributions
|
|
140
|
+
- Parameters used
|
|
141
|
+
- Token counts
|
|
142
|
+
- etc.
|
|
143
|
+
|
|
144
|
+
Example (planned output; the function currently raises NotImplementedError):
|
|
145
|
+
>>> result = compute_extended_ngrams("Sample text for analysis...")
|
|
146
|
+
>>> print(f"Top word trigrams: {result.top_word_trigrams[:3]}")
|
|
147
|
+
Top word trigrams: [('in the world', 5), ('of the world', 4), ('at the time', 3)]
|
|
148
|
+
>>> print(f"Word trigram entropy: {result.word_trigram_entropy:.2f}")
|
|
149
|
+
Word trigram entropy: 4.32
|
|
150
|
+
>>> print(f"Top POS trigrams: {result.top_pos_trigrams[:3]}")
|
|
151
|
+
Top POS trigrams: [('DET ADJ NOUN', 12), ('VERB DET NOUN', 8), ('DET NOUN VERB', 6)]
|
|
152
|
+
|
|
153
|
+
>>> # Compare authors using n-grams
|
|
154
|
+
>>> author1 = compute_extended_ngrams("Text by author 1...")
|
|
155
|
+
>>> author2 = compute_extended_ngrams("Text by author 2...")
|
|
156
|
+
>>> # Compare top_word_trigrams for distinctive phrases
|
|
157
|
+
|
|
158
|
+
Note:
|
|
159
|
+
- Memory usage scales with text length and n-gram order
|
|
160
|
+
- Longer texts yield more unique n-grams, so the *_count fields grow with text length
|
|
161
|
+
- POS n-grams require spaCy (slower but valuable)
|
|
162
|
+
- Character n-grams include whitespace
|
|
163
|
+
- Skipgrams can be very sparse (many unique patterns)
|
|
164
|
+
- Entropy values are higher for more diverse n-gram distributions
|
|
165
|
+
"""
|
|
166
|
+
# TODO: Implement extended n-gram analysis
|
|
167
|
+
# GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19
|
|
168
|
+
#
|
|
169
|
+
# Implementation steps:
|
|
170
|
+
#
|
|
171
|
+
# Word N-grams:
|
|
172
|
+
# 1. Tokenize text into words (lowercase, basic cleaning)
|
|
173
|
+
# 2. Generate word trigrams:
|
|
174
|
+
# - Slide window of size 3 across word list
|
|
175
|
+
# - Create tuples of 3 consecutive words
|
|
176
|
+
# - Count frequency of each trigram
|
|
177
|
+
# 3. Generate word 4-grams (similar, window size 4)
|
|
178
|
+
# 4. Sort by frequency, extract top_n for each
|
|
179
|
+
# 5. Calculate Shannon entropy for each distribution:
|
|
180
|
+
# H = -sum(p * log2(p)) where p = freq / total
|
|
181
|
+
#
|
|
182
|
+
# Skipgrams:
|
|
183
|
+
# 6. Generate 2-skipgrams with gap 1:
|
|
184
|
+
# - For each position i: (word[i], word[i+2])
|
|
185
|
+
# - Skips middle word
|
|
186
|
+
# - Count frequencies
|
|
187
|
+
# 7. Generate 3-skipgrams with gap 1:
|
|
188
|
+
# - For each position i: (word[i], word[i+2], word[i+3])
|
|
189
|
+
# - Pattern: word, skip, word, word
|
|
190
|
+
# - Count frequencies
|
|
191
|
+
# 8. Sort and extract top_n skipgrams
|
|
192
|
+
#
|
|
193
|
+
# POS N-grams (if include_pos_ngrams):
|
|
194
|
+
# 9. Load spaCy model for POS tagging
|
|
195
|
+
# 10. Parse text to get POS tags for each word
|
|
196
|
+
# 11. Generate POS trigrams (same as word trigrams, but use POS tags)
|
|
197
|
+
# 12. Generate POS 4-grams
|
|
198
|
+
# 13. Count frequencies, extract top_n
|
|
199
|
+
# 14. Calculate Shannon entropy
|
|
200
|
+
#
|
|
201
|
+
# Character N-grams:
|
|
202
|
+
# 15. Generate character trigrams:
|
|
203
|
+
# - Slide window of size 3 across character sequence
|
|
204
|
+
# - Include spaces and punctuation
|
|
205
|
+
# - Count frequencies
|
|
206
|
+
# 16. Generate character 4-grams (window size 4)
|
|
207
|
+
# 17. Sort and extract top_n for each
|
|
208
|
+
# 18. Calculate Shannon entropy
|
|
209
|
+
#
|
|
210
|
+
# Diversity Metrics:
|
|
211
|
+
# 19. Count total unique n-grams for each type
|
|
212
|
+
# 20. Calculate type-token ratios (unique / total)
|
|
213
|
+
#
|
|
214
|
+
# Metadata:
|
|
215
|
+
# 21. Store full frequency distributions (optional, can be large)
|
|
216
|
+
# 22. Store parameters: top_n, include_pos_ngrams, model
|
|
217
|
+
# 23. Store token/character counts
|
|
218
|
+
#
|
|
219
|
+
# Helper Functions Needed:
|
|
220
|
+
# - generate_ngrams(sequence, n) -> list[tuple]
|
|
221
|
+
# - generate_skipgrams(sequence, n, gap) -> list[tuple]
|
|
222
|
+
# - calculate_shannon_entropy(freq_dist) -> float
|
|
223
|
+
# - get_top_n(freq_dist, n) -> list[tuple]
|
|
224
|
+
#
|
|
225
|
+
# Return ExtendedNgramResult
|
|
226
|
+
#
|
|
227
|
+
# Optimization notes:
|
|
228
|
+
# - Use Counter from collections for frequency counting
|
|
229
|
+
# - Consider sampling for very long texts
|
|
230
|
+
# - Limit maximum n-gram types to prevent memory issues
|
|
231
|
+
# - POS tagging is slowest step - make it optional
|
|
232
|
+
raise NotImplementedError(
|
|
233
|
+
"Extended n-gram features not yet implemented. "
|
|
234
|
+
"See GitHub Issue #19: https://github.com/craigtrim/pystylometry/issues/19"
|
|
235
|
+
)
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
"""Rhythm and prosody metrics for written text.
|
|
2
|
+
|
|
3
|
+
Related GitHub Issue:
|
|
4
|
+
#25 - Rhythm and Prosody Metrics
|
|
5
|
+
https://github.com/craigtrim/pystylometry/issues/25
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from .rhythm_prosody import compute_rhythm_prosody
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"compute_rhythm_prosody",
|
|
12
|
+
]
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Rhythm and prosody metrics for written text.
|
|
2
|
+
|
|
3
|
+
This module captures the musical qualities of written language, including
|
|
4
|
+
stress patterns, syllable rhythms, and phonological features. While rhythm is traditionally
|
|
5
|
+
studied in spoken language, written text preserves many rhythmic patterns.
|
|
6
|
+
|
|
7
|
+
Related GitHub Issue:
|
|
8
|
+
#25 - Rhythm and Prosody Metrics
|
|
9
|
+
https://github.com/craigtrim/pystylometry/issues/25
|
|
10
|
+
|
|
11
|
+
Features analyzed:
|
|
12
|
+
- Syllable patterns and stress patterns
|
|
13
|
+
- Rhythmic regularity (coefficient of variation)
|
|
14
|
+
- Phonological features (alliteration, assonance)
|
|
15
|
+
- Syllable complexity (consonant clusters)
|
|
16
|
+
- Sentence rhythm
|
|
17
|
+
- Polysyllabic word usage
|
|
18
|
+
|
|
19
|
+
References:
|
|
20
|
+
Lea, R. B., Mulligan, E. J., & Walton, J. H. (2005). Sentence rhythm and
|
|
21
|
+
text comprehension. Memory & Cognition, 33(3), 388-396.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from .._types import RhythmProsodyResult
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def compute_rhythm_prosody(text: str) -> RhythmProsodyResult:
|
|
28
|
+
"""
|
|
29
|
+
Compute rhythm and prosody metrics for written text.
|
|
30
|
+
|
|
31
|
+
Related GitHub Issue:
|
|
32
|
+
#25 - Rhythm and Prosody Metrics
|
|
33
|
+
https://github.com/craigtrim/pystylometry/issues/25
|
|
34
|
+
|
|
35
|
+
Args:
|
|
36
|
+
text: Input text to analyze
|
|
37
|
+
|
|
38
|
+
Returns:
|
|
39
|
+
RhythmProsodyResult with syllable patterns, rhythmic regularity,
|
|
40
|
+
phonological features, stress patterns, and complexity metrics.
|
|
41
|
+
|
|
42
|
+
Example (planned usage; the function currently raises NotImplementedError):
|
|
43
|
+
>>> result = compute_rhythm_prosody("Sample text with rhythm...")
|
|
44
|
+
>>> print(f"Syllables/word: {result.mean_syllables_per_word:.2f}")
|
|
45
|
+
>>> print(f"Rhythmic regularity: {result.rhythmic_regularity:.3f}")
|
|
46
|
+
>>> print(f"Alliteration density: {result.alliteration_density:.2f}")
|
|
47
|
+
"""
|
|
48
|
+
# TODO: Implement rhythm and prosody analysis
|
|
49
|
+
# GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25
|
|
50
|
+
raise NotImplementedError(
|
|
51
|
+
"Rhythm and prosody metrics not yet implemented. "
|
|
52
|
+
"See GitHub Issue #25: https://github.com/craigtrim/pystylometry/issues/25"
|
|
53
|
+
)
|
|
@@ -1,5 +1,12 @@
|
|
|
1
1
|
"""Readability metrics."""
|
|
2
2
|
|
|
3
|
+
from .additional_formulas import (
|
|
4
|
+
compute_dale_chall,
|
|
5
|
+
compute_forcast,
|
|
6
|
+
compute_fry,
|
|
7
|
+
compute_linsear_write,
|
|
8
|
+
compute_powers_sumner_kearl,
|
|
9
|
+
)
|
|
3
10
|
from .ari import compute_ari
|
|
4
11
|
from .coleman_liau import compute_coleman_liau
|
|
5
12
|
from .flesch import compute_flesch
|
|
@@ -12,4 +19,9 @@ __all__ = [
|
|
|
12
19
|
"compute_gunning_fog",
|
|
13
20
|
"compute_coleman_liau",
|
|
14
21
|
"compute_ari",
|
|
22
|
+
"compute_dale_chall",
|
|
23
|
+
"compute_linsear_write",
|
|
24
|
+
"compute_fry",
|
|
25
|
+
"compute_forcast",
|
|
26
|
+
"compute_powers_sumner_kearl",
|
|
15
27
|
]
|