pystylometry 1.1.0-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +206 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/__init__.py +3 -0
- pystylometry/lexical/repetition.py +506 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.1.dist-info/LICENSE +21 -0
- pystylometry-1.3.1.dist-info/METADATA +79 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/RECORD +31 -16
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/entry_points.txt +0 -0
pystylometry/ngrams/extended_ngrams.py

```diff
@@ -26,7 +26,174 @@ References:
         attribution. PACLING.
 """
 
+from __future__ import annotations
+
+import math
+from collections import Counter
+from typing import Sequence
+
 from .._types import ExtendedNgramResult
+from .._utils import advanced_tokenize
+
+
+def _generate_ngrams(sequence: Sequence[str], n: int) -> list[tuple[str, ...]]:
+    """
+    Generate n-grams from a sequence.
+
+    Slides a window of size n across the sequence and yields tuples
+    of n consecutive elements.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Args:
+        sequence: List of tokens (words or characters)
+        n: Size of the n-gram (e.g., 3 for trigrams)
+
+    Returns:
+        List of n-gram tuples
+
+    Example:
+        >>> _generate_ngrams(["the", "quick", "brown", "fox"], 2)
+        [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
+    """
+    if len(sequence) < n:
+        return []
+    return [tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]
+
+
+def _generate_skipgrams(sequence: Sequence[str], n: int, gap: int) -> list[tuple[str, ...]]:
+    """
+    Generate skipgrams (n-grams with gaps) from a sequence.
+
+    Skipgrams capture non-contiguous word patterns. For example, with n=2 and
+    gap=1, "the quick brown fox" yields ("the", "brown"), ("quick", "fox").
+    This captures syntactic frames independent of the specific intervening
+    words.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    References:
+        Guthrie, D., et al. (2006). A closer look at skip-gram modelling. LREC.
+
+    Args:
+        sequence: List of tokens
+        n: Number of words to include in each skipgram
+        gap: Number of words to skip between included words
+
+    Returns:
+        List of skipgram tuples
+
+    Example:
+        >>> _generate_skipgrams(["the", "quick", "brown", "fox"], 2, 1)
+        [('the', 'brown'), ('quick', 'fox')]
+        >>> _generate_skipgrams(["a", "b", "c", "d", "e"], 3, 1)
+        [('a', 'c', 'd'), ('b', 'd', 'e')]
+    """
+    if n < 2:
+        return [(s,) for s in sequence]
+
+    # Pattern: first word at position i, skip `gap` words, then take the
+    # remaining n-1 words contiguously.
+    #   n=2, gap=1: positions [i, i+2]      -> "word1 _ word3"
+    #   n=3, gap=1: positions [i, i+2, i+3] -> "word1 _ word3 word4"
+    # Total span = 1 + gap + (n - 1) = n + gap
+    total_span = n + gap
+    if len(sequence) < total_span:
+        return []
+
+    skipgrams = []
+    for i in range(len(sequence) - total_span + 1):
+        # First word
+        gram = [sequence[i]]
+        # Skip `gap` words, then take n-1 contiguous words
+        for j in range(n - 1):
+            gram.append(sequence[i + gap + 1 + j])
+        skipgrams.append(tuple(gram))
+
+    return skipgrams
+
+
+def _calculate_shannon_entropy(counter: Counter[tuple[str, ...]]) -> float:
+    """
+    Calculate the Shannon entropy of a frequency distribution.
+
+    Shannon entropy measures the uncertainty or information content of a
+    distribution. Higher entropy indicates a more uniform (diverse)
+    distribution; lower entropy indicates a few dominant n-grams.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Formula:
+        H = -Σ p(x) * log2(p(x))
+        where p(x) = count(x) / total
+
+    Args:
+        counter: Counter object with n-gram frequencies
+
+    Returns:
+        Shannon entropy in bits. Higher values indicate more diversity.
+
+    Example:
+        Four equally likely outcomes give the maximum entropy, log2(4):
+
+        >>> from collections import Counter
+        >>> _calculate_shannon_entropy(Counter({"a": 1, "b": 1, "c": 1, "d": 1}))
+        2.0
+    """
+    if not counter:
+        return 0.0
+
+    total = sum(counter.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counter.values():
+        if count > 0:
+            p = count / total
+            entropy -= p * math.log2(p)
+
+    return entropy
+
+
+def _format_ngram(ngram: tuple[str, ...]) -> str:
+    """
+    Format an n-gram tuple as a readable string.
+
+    Args:
+        ngram: Tuple of tokens
+
+    Returns:
+        Space-joined string of the tokens (used for both word and
+        character n-grams)
+
+    Example:
+        >>> _format_ngram(("the", "quick", "fox"))
+        'the quick fox'
+    """
+    return " ".join(ngram)
+
+
+def _get_top_ngrams(counter: Counter[tuple[str, ...]], n: int) -> list[tuple[str, int]]:
+    """
+    Get the n most frequent n-grams, formatted as strings.
+
+    Args:
+        counter: Counter of n-gram tuples
+        n: Number of top items to return
+
+    Returns:
+        List of (ngram_string, count) tuples sorted by frequency
+    """
+    return [(_format_ngram(ngram), count) for ngram, count in counter.most_common(n)]
 
 
 def compute_extended_ngrams(
```
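The helpers added above are pure functions over token lists, so their behavior can be checked in isolation. A minimal sketch, assuming the installed package exposes the module at the path shown in the file list (the underscore-prefixed names are module-private, not a public API):

```python
from collections import Counter

# Private helpers from the hunk above; the import path follows the
# pystylometry/ngrams/extended_ngrams.py entry in the file list.
from pystylometry.ngrams.extended_ngrams import (
    _calculate_shannon_entropy,
    _generate_skipgrams,
)

tokens = "the cat sat on the mat and the dog sat on the rug".split()

# n=2, gap=1 pairs each word with the word two positions later, so the
# frame ("the", "sat") is counted twice even though the skipped word
# differs ("cat" in the first clause, "dog" in the second).
pairs = Counter(_generate_skipgrams(tokens, n=2, gap=1))
print(pairs.most_common(2))  # [(('the', 'sat'), 2), (('sat', 'the'), 2)]

# Entropy of the skipgram distribution, in bits. A uniform distribution
# over k patterns yields log2(k); dominant repeated frames pull it down,
# e.g. Counter({"ab": 3, "cd": 1}) gives about 0.811 bits versus 1.0
# for Counter({"ab": 1, "cd": 1}).
print(_calculate_shannon_entropy(pairs))
```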
pystylometry/ngrams/extended_ngrams.py

```diff
@@ -163,73 +330,151 @@ def compute_extended_ngrams(
         - Skipgrams can be very sparse (many unique patterns)
         - Entropy values are higher for more diverse n-gram distributions
     """
-    #
-    #
-    #
-
-    #
-
-
-    #
-
-
-    #
-    #
-    #
-
-    #
-
-
-
-    #
-
-
-
-    #
-    #
-    #
-
-    #
-
-
-
-    #
-
-
-
-    #
-    #
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # =========================================================================
+    # TOKENIZATION
+    # =========================================================================
+
+    # Word tokenization: lowercase, strip punctuation for word n-grams
+    words = advanced_tokenize(text, lowercase=True, strip_punctuation=True)
+
+    # Character sequence: lowercase but preserve spaces (for character n-grams)
+    chars = list(text.lower())
+
+    # =========================================================================
+    # WORD N-GRAMS
+    # =========================================================================
+
+    # Generate word trigrams (3-grams)
+    word_trigrams = _generate_ngrams(words, 3)
+    word_trigram_counter: Counter[tuple[str, ...]] = Counter(word_trigrams)
+
+    # Generate word 4-grams
+    word_4grams = _generate_ngrams(words, 4)
+    word_4gram_counter: Counter[tuple[str, ...]] = Counter(word_4grams)
+
+    # =========================================================================
+    # SKIPGRAMS
+    # =========================================================================
+
+    # 2-skipgrams with gap of 1: (word1, word3), skipping word2
+    skipgrams_2_1 = _generate_skipgrams(words, 2, 1)
+    skipgram_2_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_2_1)
+
+    # 3-skipgrams with gap of 1: (word1, word3, word4), skipping word2
+    skipgrams_3_1 = _generate_skipgrams(words, 3, 1)
+    skipgram_3_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_3_1)
+
+    # =========================================================================
+    # POS N-GRAMS (optional, requires spaCy)
+    # =========================================================================
+
+    pos_trigram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_4gram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_trigram_entropy = 0.0
+
+    if include_pos_ngrams:
+        try:
+            import spacy
+        except ImportError:
+            # spaCy not installed - leave the POS results empty
+            spacy = None
+
+        if spacy is not None:
+            # Load the spaCy model
+            try:
+                nlp = spacy.load(spacy_model)
+            except OSError:
+                # Model not installed - surface a helpful message
+                raise ImportError(
+                    f"spaCy model '{spacy_model}' not found. "
+                    f"Install with: python -m spacy download {spacy_model}"
+                ) from None
+
+            # Process the text and extract POS tags
+            doc = nlp(text)
+            pos_tags = [token.pos_ for token in doc if not token.is_space]
+
+            # Generate POS trigrams
+            pos_trigrams = _generate_ngrams(pos_tags, 3)
+            pos_trigram_counter = Counter(pos_trigrams)
+
+            # Generate POS 4-grams
+            pos_4grams = _generate_ngrams(pos_tags, 4)
+            pos_4gram_counter = Counter(pos_4grams)
+
+            pos_trigram_entropy = _calculate_shannon_entropy(pos_trigram_counter)
+
+    # =========================================================================
+    # CHARACTER N-GRAMS
+    # =========================================================================
+
+    # Character trigrams
+    char_trigrams = _generate_ngrams(chars, 3)
+    char_trigram_counter: Counter[tuple[str, ...]] = Counter(char_trigrams)
+
+    # Character 4-grams
+    char_4grams = _generate_ngrams(chars, 4)
+    char_4gram_counter: Counter[tuple[str, ...]] = Counter(char_4grams)
+
+    # =========================================================================
+    # ENTROPY CALCULATIONS
+    # =========================================================================
+
+    word_trigram_entropy = _calculate_shannon_entropy(word_trigram_counter)
+    word_4gram_entropy = _calculate_shannon_entropy(word_4gram_counter)
+    char_trigram_entropy = _calculate_shannon_entropy(char_trigram_counter)
+    char_4gram_entropy = _calculate_shannon_entropy(char_4gram_counter)
+
+    # =========================================================================
+    # BUILD RESULT
+    # =========================================================================
+
+    return ExtendedNgramResult(
+        # Word n-grams
+        top_word_trigrams=_get_top_ngrams(word_trigram_counter, top_n),
+        top_word_4grams=_get_top_ngrams(word_4gram_counter, top_n),
+        word_trigram_count=len(word_trigram_counter),
+        word_4gram_count=len(word_4gram_counter),
+        word_trigram_entropy=word_trigram_entropy,
+        word_4gram_entropy=word_4gram_entropy,
+        # Skipgrams
+        top_skipgrams_2_1=_get_top_ngrams(skipgram_2_1_counter, top_n),
+        top_skipgrams_3_1=_get_top_ngrams(skipgram_3_1_counter, top_n),
+        skipgram_2_1_count=len(skipgram_2_1_counter),
+        skipgram_3_1_count=len(skipgram_3_1_counter),
+        # POS n-grams
+        top_pos_trigrams=_get_top_ngrams(pos_trigram_counter, top_n),
+        top_pos_4grams=_get_top_ngrams(pos_4gram_counter, top_n),
+        pos_trigram_count=len(pos_trigram_counter),
+        pos_4gram_count=len(pos_4gram_counter),
+        pos_trigram_entropy=pos_trigram_entropy,
+        # Character n-grams
+        top_char_trigrams=_get_top_ngrams(char_trigram_counter, top_n),
+        top_char_4grams=_get_top_ngrams(char_4gram_counter, top_n),
+        char_trigram_entropy=char_trigram_entropy,
+        char_4gram_entropy=char_4gram_entropy,
+        # Metadata
+        metadata={
+            "parameters": {
+                "top_n": top_n,
+                "include_pos_ngrams": include_pos_ngrams,
+                "spacy_model": spacy_model if include_pos_ngrams else None,
+            },
+            "token_count": len(words),
+            "character_count": len(chars),
+            "word_trigram_tokens": len(word_trigrams),
+            "word_4gram_tokens": len(word_4grams),
+            "char_trigram_tokens": len(char_trigrams),
+            "char_4gram_tokens": len(char_4grams),
+            "full_distributions": {
+                "word_trigrams": dict(word_trigram_counter.most_common(100)),
+                "word_4grams": dict(word_4gram_counter.most_common(100)),
+                "skipgrams_2_1": dict(skipgram_2_1_counter.most_common(100)),
+                "skipgrams_3_1": dict(skipgram_3_1_counter.most_common(100)),
+                "pos_trigrams": dict(pos_trigram_counter.most_common(100)),
+                "pos_4grams": dict(pos_4gram_counter.most_common(100)),
+                "char_trigrams": dict(char_trigram_counter.most_common(100)),
+                "char_4grams": dict(char_4gram_counter.most_common(100)),
+            },
+        },
     )
```
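To tie the hunk together, a hedged usage sketch. The parameters `top_n`, `include_pos_ngrams`, and `spacy_model` are taken from the metadata block above; the full signature of `compute_extended_ngrams` (and any `ExtendedNgramResult` fields beyond those passed to the constructor) is not shown in this diff, so the text-first calling convention is an assumption:

```python
# Module path per the file list; assumes the text is the first argument.
from pystylometry.ngrams.extended_ngrams import compute_extended_ngrams

text = (
    "The quick brown fox jumps over the lazy dog. "
    "The quick brown fox naps beside the lazy dog."
)

# POS counters stay empty unless include_pos_ngrams=True and spaCy
# (plus the named model) is installed.
result = compute_extended_ngrams(text, top_n=10, include_pos_ngrams=False)

# The repeated sentence opening surfaces as a high-count word trigram.
print(result.top_word_trigrams[:3])  # e.g. [('the quick brown', 2), ...]
print(result.word_trigram_entropy)   # bits; lower means more repetition
print(result.metadata["token_count"])
```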
pystylometry/prosody/README.md

```diff
@@ -0,0 +1,17 @@
+# prosody
+
+[image]
+[image]
+
+Rhythm and stress pattern analysis for written text.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `rhythm_prosody.py` | `compute_rhythm_prosody` | Syllable stress patterns, rhythm regularity, prose rhythm metrics |
+
+## See Also
+
+- [`readability/syllables.py`](../readability/) for the syllable counting engine
+- [`syntactic/`](../syntactic/) for sentence structure features that interact with prosodic rhythm
```