pystylometry 1.3.0__py3-none-any.whl → 1.3.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/_types.py +152 -0
- pystylometry/lexical/__init__.py +3 -0
- pystylometry/lexical/repetition.py +506 -0
- pystylometry-1.3.1.dist-info/LICENSE +21 -0
- pystylometry-1.3.1.dist-info/METADATA +79 -0
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.1.dist-info}/RECORD +8 -6
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.1.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/METADATA +0 -136
- {pystylometry-1.3.0.dist-info → pystylometry-1.3.1.dist-info}/entry_points.txt +0 -0
pystylometry/_types.py
CHANGED
@@ -370,6 +370,158 @@ class TTRResult:
     metadata: dict[str, Any]
 
 
+# ===== Repetition Detection Results =====
+# Related to GitHub Issue #28: Verbal tics detection for slop analysis
+# https://github.com/craigtrim/pystylometry/issues/28
+
+
+@dataclass
+class RepetitiveWord:
+    """A single word flagged as abnormally repetitive.
+
+    The repetition_score is the ratio of observed count to expected count
+    based on the word's frequency in the British National Corpus (BNC).
+    Higher scores indicate stronger overrepresentation.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Attributes:
+        word: The flagged word (lowercased).
+        count: Observed count in the text.
+        expected_count: Expected count based on BNC relative frequency × text length.
+            0.0 if word not found in BNC.
+        repetition_score: count / expected_count. float('inf') if expected_count is 0.
+        bnc_bucket: BNC frequency bucket (1-100, 1=most frequent). None if not in BNC.
+        chunk_counts: Per-chunk occurrence counts (for distribution analysis).
+        distribution_entropy: Shannon entropy of the word's chunk distribution.
+            High entropy = suspiciously even spread (model tic).
+            Low entropy = clustered usage (human writing about a specific scene).
+        distribution_variance: Variance of per-chunk counts.
+    """
+
+    word: str
+    count: int
+    expected_count: float
+    repetition_score: float
+    bnc_bucket: int | None
+    chunk_counts: list[int]
+    distribution_entropy: float
+    distribution_variance: float
+
+
+@dataclass
+class RepetitiveUnigramsResult:
+    """Result from repetitive unigram detection.
+
+    Identifies content words that appear far more frequently than expected
+    based on their frequency in the British National Corpus (BNC, ~100M tokens).
+    This is a key indicator of AI-generated "slop" where models exhibit verbal
+    tics — repeating certain words with suspicious regularity.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    The slop_score provides a single aggregate metric:
+        slop_score = flagged_words_per_10k × mean_repetition_score
+
+    Where:
+    - flagged_words_per_10k = count of flagged words / (total content words / 10000)
+    - mean_repetition_score = mean repetition_score across all flagged words
+
+    Higher slop_score = more likely AI-generated verbal tics.
+
+    References:
+        British National Corpus Consortium. (2007). The British National Corpus,
+        version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+
+    Example:
+        >>> result = compute_repetitive_unigrams(text)
+        >>> for w in result.repetitive_words[:5]:
+        ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
+        ...           f"score {w.repetition_score:.1f})")
+        shimmered: 23x (expected 0.1, score 266.2)
+        >>> result.slop_score
+        42.7
+    """
+
+    repetitive_words: list[RepetitiveWord]  # Sorted by repetition_score descending
+    total_content_words: int
+    flagged_count: int  # Number of words exceeding threshold
+    flagged_words_per_10k: float  # flagged_count / (total_content_words / 10000)
+    mean_repetition_score: float  # Mean score across flagged words
+    slop_score: float  # Aggregate: flagged_words_per_10k × mean_repetition_score
+    total_content_words_dist: Distribution
+    chunk_size: int
+    chunk_count: int
+    metadata: dict[str, Any]
+
+
+@dataclass
+class RepetitiveNgram:
+    """A single n-gram flagged as abnormally repetitive.
+
+    Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim
+    in natural writing. N-grams that repeat beyond a length-scaled threshold
+    are flagged.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Attributes:
+        ngram: The flagged n-gram as a tuple of words.
+        count: Observed count in the text.
+        frequency_per_10k: Occurrences per 10,000 n-grams.
+        chunk_counts: Per-chunk occurrence counts.
+        distribution_entropy: Shannon entropy of the n-gram's chunk distribution.
+        distribution_variance: Variance of per-chunk counts.
+    """
+
+    ngram: tuple[str, ...]
+    count: int
+    frequency_per_10k: float
+    chunk_counts: list[int]
+    distribution_entropy: float
+    distribution_variance: float
+
+
+@dataclass
+class RepetitiveNgramsResult:
+    """Result from repetitive n-gram detection.
+
+    Detects bigrams, trigrams, or higher-order n-grams that repeat more than
+    expected within the text. No external corpus is required — content n-grams
+    should not repeat verbatim often in natural writing.
+
+    N-grams composed entirely of function words (e.g., "of the", "in a") are
+    excluded since their repetition is expected.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Example:
+        >>> result = compute_repetitive_ngrams(text, n=2)
+        >>> for ng in result.repetitive_ngrams[:5]:
+        ...     print(f"{' '.join(ng.ngram)}: {ng.count}x "
+        ...           f"({ng.frequency_per_10k:.1f} per 10k)")
+        uncomfortable truth: 8x (1.6 per 10k)
+    """
+
+    repetitive_ngrams: list[RepetitiveNgram]  # Sorted by count descending
+    n: int | tuple[int, ...]  # N-gram order(s) analyzed
+    total_ngrams: int
+    flagged_count: int
+    flagged_per_10k: float  # flagged_count / (total_ngrams / 10000)
+    total_ngrams_dist: Distribution
+    chunk_size: int
+    chunk_count: int
+    metadata: dict[str, Any]
+
+
 # ===== Readability Results =====
 
 
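The slop_score arithmetic documented in RepetitiveUnigramsResult above is simple enough to check by hand. Below is a minimal sketch of that aggregate, assuming the formula exactly as documented (flagged_words_per_10k × mean_repetition_score); the numbers are made up for illustration and are not taken from the package or its tests.

```python
# Hypothetical worked example of the slop_score aggregate documented above
# (illustrative numbers only; not part of pystylometry).
flagged_scores = [266.2, 120.0, 15.8]  # repetition_score of each flagged word
total_content_words = 12_000

flagged_words_per_10k = len(flagged_scores) / (total_content_words / 10_000)  # 2.5
mean_repetition_score = sum(flagged_scores) / len(flagged_scores)             # 134.0
slop_score = flagged_words_per_10k * mean_repetition_score                    # 335.0
```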
pystylometry/lexical/__init__.py
CHANGED
@@ -5,6 +5,7 @@ from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compu
 from .function_words import compute_function_words
 from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
+from .repetition import compute_repetitive_ngrams, compute_repetitive_unigrams
 from .ttr import compute_ttr
 from .word_frequency_sophistication import compute_word_frequency_sophistication
 from .yule import compute_yule
@@ -21,4 +22,6 @@ __all__ = [
     "compute_hdd",
     "compute_msttr",
     "compute_word_frequency_sophistication",
+    "compute_repetitive_unigrams",
+    "compute_repetitive_ngrams",
 ]
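With the export added above, both detectors become importable directly from pystylometry.lexical. A minimal usage sketch, assuming the package is installed with the optional bnc-lookup dependency (required by the unigram detector) and that manuscript.txt is a hypothetical local file:

```python
from pystylometry.lexical import compute_repetitive_ngrams, compute_repetitive_unigrams

with open("manuscript.txt", encoding="utf-8") as fh:  # hypothetical input file
    text = fh.read()

uni = compute_repetitive_unigrams(text, threshold=3.0, min_count=3)
print(uni.flagged_count, round(uni.slop_score, 1))

ngr = compute_repetitive_ngrams(text, n=(2, 3))
for ng in ngr.repetitive_ngrams[:5]:
    print(" ".join(ng.ngram), ng.count)
```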
pystylometry/lexical/repetition.py
ADDED
@@ -0,0 +1,506 @@
+"""Repetitive word and n-gram detection for verbal tics / slop analysis.
+
+This module detects abnormally repetitive words and phrases in text — a common
+pattern in AI-generated content ("slop") where certain content words and phrases
+appear far more frequently than expected.
+
+Generative models exhibit "verbal tics": they repeatedly use certain words and
+phrases throughout generated text. Examples include "shimmered", "flickered",
+"obsidian", "a testament to", "an uncomfortable truth". These patterns differ
+from natural human writing where content words appear when contextually relevant,
+repetition clusters around specific scenes or topics, and unusual words don't
+appear with suspiciously even distribution.
+
+Two functions are provided:
+
+compute_repetitive_unigrams:
+    Compares observed word frequencies against the British National Corpus
+    (BNC, ~100M tokens) baseline. Words that appear far more than their
+    BNC relative frequency predicts are flagged.
+
+compute_repetitive_ngrams:
+    Detects content n-grams (bigrams, trigrams, etc.) that repeat more
+    than expected. No external corpus is required — content n-grams should
+    not repeat verbatim often in natural writing.
+
+Both functions support chunked analysis to reveal distribution patterns:
+- Even distribution across text = suspicious (model's consistent tic)
+- Clustered distribution = likely intentional (human describing a scene)
+
+Related GitHub Issue:
+    #28 - Verbal tics detection for slop analysis
+    https://github.com/craigtrim/pystylometry/issues/28
+
+Dependencies:
+    - bnc-lookup >= 1.3.0 (optional, in lexical group)
+      Provides expected_count() and bucket() for BNC baseline comparison.
+
+References:
+    British National Corpus Consortium. (2007). The British National Corpus,
+    version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+    Kilgarriff, A. (2001). BNC database and word frequency lists.
+    https://www.kilgarriff.co.uk/bnc-readme.html
+"""
+
+from __future__ import annotations
+
+import math
+import statistics
+from collections import Counter
+
+from .._types import (
+    Distribution,
+    RepetitiveNgram,
+    RepetitiveNgramsResult,
+    RepetitiveUnigramsResult,
+    RepetitiveWord,
+    chunk_text,
+    make_distribution,
+)
+from .._utils import check_optional_dependency, tokenize
+from .function_words import (
+    AUXILIARIES,
+    CONJUNCTIONS,
+    DETERMINERS,
+    PARTICLES,
+    PREPOSITIONS,
+    PRONOUNS,
+)
+
+# Union of all function word sets — used to filter out non-content words
+_FUNCTION_WORDS = DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
+
+
+def _chunk_entropy(chunk_counts: list[int]) -> float:
+    """Compute Shannon entropy of a word's distribution across chunks.
+
+    Entropy measures how evenly a word is distributed across chunks.
+    High entropy means the word appears evenly (suspicious for rare words).
+    Low entropy means the word is concentrated in specific chunks (natural).
+
+    Formula:
+        H = -sum(p_i * log2(p_i)) for each chunk i where p_i > 0
+        p_i = count_in_chunk_i / total_count
+
+    Args:
+        chunk_counts: Per-chunk occurrence counts.
+
+    Returns:
+        Shannon entropy in bits. 0.0 if the word appears in only one chunk.
+        Returns 0.0 for empty or all-zero counts.
+    """
+    total = sum(chunk_counts)
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in chunk_counts:
+        if count > 0:
+            p = count / total
+            entropy -= p * math.log2(p)
+
+    return entropy
+
+
+def _tokenize_content_words(text: str) -> list[str]:
+    """Tokenize text and return only lowercase alphabetic content words.
+
+    Filters out:
+    - Non-alphabetic tokens (punctuation, numbers)
+    - Function words (determiners, prepositions, conjunctions,
+      pronouns, auxiliaries, particles)
+
+    Args:
+        text: Input text.
+
+    Returns:
+        List of lowercase content word tokens.
+    """
+    tokens = tokenize(text.lower())
+    return [t for t in tokens if t.isalpha() and t not in _FUNCTION_WORDS]
+
+
+def compute_repetitive_unigrams(
+    text: str,
+    threshold: float = 3.0,
+    chunk_size: int = 1000,
+    min_count: int = 3,
+) -> RepetitiveUnigramsResult:
+    """Detect content words that repeat far more than expected based on BNC frequencies.
+
+    For each content word in the text, computes:
+        expected_count = BNC_relative_frequency(word) * text_length
+        repetition_score = observed_count / expected_count
+
+    Words exceeding the threshold score and minimum count are flagged.
+
+    This function uses native chunked analysis to capture distribution patterns
+    across the text. Words that are evenly distributed (high entropy) are more
+    suspicious than words clustered in specific sections.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    References:
+        British National Corpus Consortium. (2007). The British National Corpus,
+        version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+
+    Args:
+        text: Input text to analyze.
+        threshold: Minimum repetition_score (observed/expected) to flag a word.
+            Default 3.0 means the word must appear at least 3x more than expected.
+        chunk_size: Number of words per chunk for distribution analysis (default: 1000).
+        min_count: Minimum observed count to flag a word. Prevents flagging words
+            that appear only once or twice, which aren't meaningfully repetitive
+            regardless of their score. Default: 3.
+    Returns:
+        RepetitiveUnigramsResult with flagged words, aggregate scores, and metadata.
+
+    Example:
+        >>> result = compute_repetitive_unigrams(novel_text)
+        >>> for w in result.repetitive_words[:5]:
+        ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
+        ...           f"score {w.repetition_score:.1f})")
+        shimmered: 23x (expected 0.1, score 266.2)
+        obsidian: 18x (expected 0.0, score 450.0)
+        >>> print(f"Slop score: {result.slop_score:.1f}")
+        Slop score: 42.7
+    """
+    check_optional_dependency("bnc_lookup", "lexical")
+
+    from bnc_lookup import bucket as bnc_bucket  # type: ignore[import-untyped]
+    from bnc_lookup import expected_count as bnc_expected_count  # type: ignore[import-untyped]
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Tokenize each chunk into content words
+    chunk_tokens: list[list[str]] = [_tokenize_content_words(chunk) for chunk in chunks]
+
+    # Count content words per chunk
+    chunk_counters: list[Counter[str]] = [Counter(tokens) for tokens in chunk_tokens]
+    content_words_per_chunk = [len(tokens) for tokens in chunk_tokens]
+
+    # Build global content word counts
+    global_counter: Counter[str] = Counter()
+    for counter in chunk_counters:
+        global_counter.update(counter)
+
+    total_content_words = sum(global_counter.values())
+
+    # Handle empty text
+    if total_content_words == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return RepetitiveUnigramsResult(
+            repetitive_words=[],
+            total_content_words=0,
+            flagged_count=0,
+            flagged_words_per_10k=0.0,
+            mean_repetition_score=0.0,
+            slop_score=0.0,
+            total_content_words_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"threshold": threshold, "min_count": min_count},
+        )
+
+    # Evaluate each content word against BNC baseline
+    flagged: list[RepetitiveWord] = []
+
+    for word, observed in global_counter.items():
+        if observed < min_count:
+            continue
+
+        # Get BNC expected count for this word given our text length
+        expected = bnc_expected_count(word, total_content_words)
+        word_bucket = bnc_bucket(word)
+
+        if expected is None or expected == 0.0:
+            # Word not in BNC or has zero expected frequency
+            # Any repeated occurrence is notable
+            score = float("inf")
+            expected_val = 0.0
+        else:
+            expected_val = expected
+            score = observed / expected_val
+
+        if score >= threshold:
+            # Build per-chunk counts for this word
+            per_chunk = [counter.get(word, 0) for counter in chunk_counters]
+            entropy = _chunk_entropy(per_chunk)
+            variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0
+
+            flagged.append(
+                RepetitiveWord(
+                    word=word,
+                    count=observed,
+                    expected_count=expected_val,
+                    repetition_score=score,
+                    bnc_bucket=word_bucket,
+                    chunk_counts=per_chunk,
+                    distribution_entropy=entropy,
+                    distribution_variance=variance,
+                )
+            )
+
+    # Sort by repetition_score descending (inf scores sort first via the key trick)
+    flagged.sort(
+        key=lambda w: (
+            -w.repetition_score if w.repetition_score != float("inf") else -1e18,
+            -w.count,
+        )
+    )
+
+    # Compute aggregate metrics
+    flagged_count = len(flagged)
+    flagged_words_per_10k = (
+        flagged_count / (total_content_words / 10_000) if total_content_words > 0 else 0.0
+    )
+
+    # Mean repetition score (exclude inf for meaningful average)
+    finite_scores = [w.repetition_score for w in flagged if w.repetition_score != float("inf")]
+    mean_rep_score = statistics.mean(finite_scores) if finite_scores else 0.0
+
+    slop_score = flagged_words_per_10k * mean_rep_score
+
+    # Content words distribution
+    content_dist = (
+        make_distribution([float(c) for c in content_words_per_chunk])
+        if content_words_per_chunk
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+
+    return RepetitiveUnigramsResult(
+        repetitive_words=flagged,
+        total_content_words=total_content_words,
+        flagged_count=flagged_count,
+        flagged_words_per_10k=flagged_words_per_10k,
+        mean_repetition_score=mean_rep_score,
+        slop_score=slop_score,
+        total_content_words_dist=content_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "threshold": threshold,
+            "min_count": min_count,
+            "total_unique_content_words": len(global_counter),
+            "inf_score_count": sum(1 for w in flagged if w.repetition_score == float("inf")),
+        },
+    )
+
+
+def _validate_n(n: int | tuple[int, ...]) -> tuple[int, ...]:
+    """Validate and normalize the n-gram order parameter.
+
+    Args:
+        n: Single integer or tuple of integers specifying n-gram orders.
+
+    Returns:
+        Sorted tuple of unique valid n-gram orders.
+
+    Raises:
+        ValueError: If any value is outside the range [2, 5] or input is empty.
+    """
+    values: tuple[int, ...]
+    if isinstance(n, int):
+        values = (n,)
+    else:
+        values = tuple(sorted(set(n)))
+
+    if not values:
+        raise ValueError("n must specify at least one n-gram order.")
+
+    for v in values:
+        if v < 2:
+            raise ValueError(
+                f"n-gram order {v} is too small. Minimum is 2 (bigrams). "
+                f"For single-word repetition, use compute_repetitive_unigrams() instead."
+            )
+        if v > 5:
+            raise ValueError(
+                f"n-gram order {v} is too large. Maximum is 5. "
+                f"N-grams of order 6+ are too sparse to produce meaningful repetition "
+                f"signals in typical texts (they rarely repeat even once)."
+            )
+
+    return values
+
+
+def _is_content_ngram(ngram: tuple[str, ...]) -> bool:
+    """Check if an n-gram contains at least one content word.
+
+    An n-gram composed entirely of function words (e.g., "of the", "in a")
+    is expected to repeat and should not be flagged.
+
+    Args:
+        ngram: Tuple of words.
+
+    Returns:
+        True if at least one word is not a function word.
+    """
+    return any(word not in _FUNCTION_WORDS for word in ngram)
+
+
+def compute_repetitive_ngrams(
+    text: str,
+    n: int | tuple[int, ...] = (2, 3),
+    chunk_size: int = 1000,
+    min_count: int = 3,
+) -> RepetitiveNgramsResult:
+    """Detect content n-grams that repeat more than expected within the text.
+
+    Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim in
+    natural writing. This function flags n-grams that exceed a length-scaled
+    threshold, filtering out n-grams composed entirely of function words.
+
+    No external corpus is required — the threshold is computed internally based
+    on text length. Any content n-gram appearing at least
+    max(min_count, total_ngrams / 10000) times is flagged.
+
+    Related GitHub Issue:
+        #28 - Verbal tics detection for slop analysis
+        https://github.com/craigtrim/pystylometry/issues/28
+
+    Args:
+        text: Input text to analyze.
+        n: N-gram order(s) to analyze. Can be a single integer (e.g., 2 for
+            bigrams) or a tuple of integers (e.g., (2, 3) for bigrams and
+            trigrams). Valid range: 2 to 5. Default: (2, 3).
+            - Values below 2 are rejected (use compute_repetitive_unigrams
+              for single words).
+            - Values above 5 are rejected (n-grams of order 6+ are too sparse
+              to produce meaningful repetition signals).
+        chunk_size: Number of words per chunk for distribution analysis (default: 1000).
+        min_count: Minimum count to flag an n-gram. Default: 3.
+
+    Returns:
+        RepetitiveNgramsResult with flagged n-grams, counts, and metadata.
+
+    Example:
+        >>> result = compute_repetitive_ngrams(text, n=2)
+        >>> for ng in result.repetitive_ngrams[:5]:
+        ...     print(f"{' '.join(ng.ngram)}: {ng.count}x")
+        uncomfortable truth: 8x
+        >>> result = compute_repetitive_ngrams(text, n=(2, 3, 4))
+        >>> print(f"Flagged: {result.flagged_count} n-grams")
+    """
+    # Validate n parameter
+    n_values = _validate_n(n)
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Tokenize each chunk — lowercase alpha only (but keep function words
+    # so n-grams spanning content+function words are preserved; we filter
+    # all-function-word n-grams separately)
+    chunk_tokens: list[list[str]] = []
+    for chunk in chunks:
+        tokens = tokenize(chunk.lower())
+        chunk_tokens.append([t for t in tokens if t.isalpha()])
+
+    # Build n-grams per chunk for each requested order
+    # chunk_ngram_counters[chunk_idx] aggregates across all n values
+    chunk_ngram_counters: list[Counter[tuple[str, ...]]] = [Counter() for _ in chunks]
+    total_ngram_count = 0
+
+    for chunk_idx, tokens in enumerate(chunk_tokens):
+        for nv in n_values:
+            for i in range(len(tokens) - nv + 1):
+                ngram = tuple(tokens[i : i + nv])
+                if _is_content_ngram(ngram):
+                    chunk_ngram_counters[chunk_idx][ngram] += 1
+                    total_ngram_count += 1
+
+    # Build global counts
+    global_ngram_counter: Counter[tuple[str, ...]] = Counter()
+    for counter in chunk_ngram_counters:
+        global_ngram_counter.update(counter)
+
+    # Determine threshold: any content n-gram appearing at least this many times is flagged
+    length_threshold = max(min_count, total_ngram_count // 10_000)
+
+    # Handle empty text
+    if total_ngram_count == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+        return RepetitiveNgramsResult(
+            repetitive_ngrams=[],
+            n=n,
+            total_ngrams=0,
+            flagged_count=0,
+            flagged_per_10k=0.0,
+            total_ngrams_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={"min_count": min_count, "effective_threshold": length_threshold},
+        )
+
+    # Flag n-grams exceeding threshold
+    flagged: list[RepetitiveNgram] = []
+
+    for ngram, count in global_ngram_counter.items():
+        if count >= length_threshold:
+            per_chunk = [counter.get(ngram, 0) for counter in chunk_ngram_counters]
+            entropy = _chunk_entropy(per_chunk)
+            variance = statistics.variance(per_chunk) if len(per_chunk) > 1 else 0.0
+            freq_per_10k = count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0
+
+            flagged.append(
+                RepetitiveNgram(
+                    ngram=ngram,
+                    count=count,
+                    frequency_per_10k=freq_per_10k,
+                    chunk_counts=per_chunk,
+                    distribution_entropy=entropy,
+                    distribution_variance=variance,
+                )
+            )
+
+    # Sort by count descending
+    flagged.sort(key=lambda ng: -ng.count)
+
+    flagged_count = len(flagged)
+    flagged_per_10k = flagged_count / (total_ngram_count / 10_000) if total_ngram_count > 0 else 0.0
+
+    # N-grams per chunk distribution
+    ngrams_per_chunk = [sum(counter.values()) for counter in chunk_ngram_counters]
+    ngrams_dist = (
+        make_distribution([float(c) for c in ngrams_per_chunk])
+        if ngrams_per_chunk
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+
+    return RepetitiveNgramsResult(
+        repetitive_ngrams=flagged,
+        n=n,
+        total_ngrams=total_ngram_count,
+        flagged_count=flagged_count,
+        flagged_per_10k=flagged_per_10k,
+        total_ngrams_dist=ngrams_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "min_count": min_count,
+            "effective_threshold": length_threshold,
+            "n_values": list(n_values),
+            "total_unique_ngrams": len(global_ngram_counter),
+        },
+    )
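The chunk-entropy heuristic used by both detectors is easy to sanity-check: a term spread evenly across chunks maximizes Shannon entropy, while a term confined to one chunk drives it to zero. A self-contained sketch of the H = -Σ p_i * log2(p_i) formula from _chunk_entropy (no pystylometry import needed):

```python
import math

def chunk_entropy(chunk_counts: list[int]) -> float:
    # Shannon entropy (bits) of a term's distribution over chunks,
    # mirroring the formula documented in _chunk_entropy above.
    total = sum(chunk_counts)
    if total == 0:
        return 0.0
    return -sum((c / total) * math.log2(c / total) for c in chunk_counts if c > 0)

print(chunk_entropy([2, 2, 2, 2, 2]))   # even spread -> log2(5) ≈ 2.32 bits (tic-like)
print(chunk_entropy([0, 10, 0, 0, 0]))  # one chunk   -> 0.0 bits (scene-specific)
```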
pystylometry-1.3.1.dist-info/LICENSE
ADDED
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2025 Craig Trim
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
pystylometry-1.3.1.dist-info/METADATA
ADDED
@@ -0,0 +1,79 @@
+Metadata-Version: 2.1
+Name: pystylometry
+Version: 1.3.1
+Summary: Comprehensive Python package for stylometric analysis
+License: MIT
+Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
+Author: Craig Trim
+Author-email: craigtrim@gmail.com
+Requires-Python: >=3.9,<4.0
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Text Processing :: Linguistic
+Classifier: Typing :: Typed
+Requires-Dist: stylometry-ttr (>=1.0.3,<2.0.0)
+Project-URL: Homepage, https://github.com/craigtrim/pystylometry
+Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
+Project-URL: Repository, https://github.com/craigtrim/pystylometry
+Description-Content-Type: text/markdown
+
+# pystylometry
+
+[](https://badge.fury.io/py/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://pepy.tech/project/pystylometry)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+[]()
+
+Stylometric analysis and authorship attribution for Python. 50+ metrics across 11 modules, from vocabulary diversity to AI-generation detection.
+
+## Install
+
+```bash
+pip install pystylometry       # Core (lexical metrics)
+pip install pystylometry[all]  # Everything
+```
+
+## Modules
+
+| Module | Metrics | Description |
+|--------|---------|-------------|
+| [**lexical**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/lexical) | TTR, MTLD, Yule's K/I, Hapax, MATTR, VocD-D, HD-D, MSTTR, function words, word frequency | Vocabulary diversity and richness |
+| [**readability**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/readability) | Flesch, Flesch-Kincaid, SMOG, Gunning Fog, Coleman-Liau, ARI, Dale-Chall, Fry, FORCAST, Linsear Write, Powers-Sumner-Kearl | Grade-level and difficulty scoring |
+| [**syntactic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/syntactic) | POS ratios, sentence types, parse tree depth, clausal density, passive voice, T-units, dependency distance | Sentence and parse structure (requires spaCy) |
+| [**authorship**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/authorship) | Burrows' Delta, Cosine Delta, Zeta, Kilgarriff chi-squared, MinMax, John's Delta, NCD | Author attribution and text comparison |
+| [**stylistic**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/stylistic) | Contractions, hedges, intensifiers, modals, punctuation, vocabulary overlap (Jaccard/Dice/Cosine/KL), cohesion, genre/register | Style markers and text similarity |
+| [**character**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/character) | Letter frequencies, digit/uppercase ratios, special characters, whitespace | Character-level fingerprinting |
+| [**ngrams**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/ngrams) | Word/character/POS n-grams, Shannon entropy, skipgrams | N-gram profiles and entropy |
+| [**dialect**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/dialect) | British/American classification, spelling/grammar/vocabulary markers, markedness | Regional dialect detection |
+| [**consistency**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/consistency) | Sliding-window chi-squared drift, pattern classification | Intra-document style analysis |
+| [**prosody**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/prosody) | Syllable stress, rhythm regularity | Prose rhythm (requires spaCy) |
+| [**viz**](https://github.com/craigtrim/pystylometry/tree/master/pystylometry/viz) | Timeline, scatter, report (PNG + interactive HTML) | Drift detection visualization |
+
+## Development
+
+```bash
+git clone https://github.com/craigtrim/pystylometry && cd pystylometry
+pip install -e ".[dev,all]"
+make test   # 1022 tests
+make lint   # ruff + mypy
+make all    # lint + test + build
+```
+
+## License
+
+MIT
+
+## Author
+
+Craig Trim -- craigtrim@gmail.com
+
{pystylometry-1.3.0.dist-info → pystylometry-1.3.1.dist-info}/RECORD
CHANGED
@@ -1,7 +1,7 @@
 pystylometry/README.md,sha256=WFOtCAF3qtDTgGG3a_jTjNSwVgpQEXI1PKqbVBfyo1M,2366
 pystylometry/__init__.py,sha256=Z6zkHlX05SUeObDca9dL1Gkfq4UPBWbU2M4sp4fVj78,9220
 pystylometry/_normalize.py,sha256=7tdfgAKg5CI2d4eoDypmFqOVByoxpwgUUZD6vyBH86A,8679
-pystylometry/_types.py,sha256=
+pystylometry/_types.py,sha256=g6XzwCHeMAIBfexId6Pd9EQfJzvZ0KYMfD4kpS5T7BQ,82284
 pystylometry/_utils.py,sha256=CXTx4KDJ_6iiHcc2OXqOYs-izhLf_ZEmJFKdHyd7q34,5282
 pystylometry/authorship/README.md,sha256=zNXCpLj7nczPnYykJnCUw3y-kxfC9mWZmngi3nfw6us,1016
 pystylometry/authorship/__init__.py,sha256=D7m38hWi_62o1ZDSrghLCfob9YsykTht4K37wiVgHfg,1530
@@ -24,11 +24,12 @@ pystylometry/dialect/_data/dialect_markers.json,sha256=DthluOA6q0rG_8IrCrFIYWh_E
 pystylometry/dialect/_loader.py,sha256=M2ATp-5754v_yX9EWvBP0r5qgNf8xlL8XadVsVb_Hco,12989
 pystylometry/dialect/detector.py,sha256=9x0ZuIfTIjsmdNSx0Ezy5AC0SAFtC4kVw11iOSBd9gQ,20147
 pystylometry/lexical/README.md,sha256=cFQ7KRZV4ubsQwIlOH3YHTbhhNl5X91Sr3zcn-3x0HI,1185
-pystylometry/lexical/__init__.py,sha256=
+pystylometry/lexical/__init__.py,sha256=_VpemdfVYZYXHP4ulTItoyegJ-3lE85wlfzDCpseaNE,898
 pystylometry/lexical/advanced_diversity.py,sha256=rL1hlNqTnaEFcA2v4oBJlojHZMTqdvvm4jYXTFGVpYE,25664
 pystylometry/lexical/function_words.py,sha256=eel9bq_qWgWlvG0NtDiouilMt9kaFqz2rh3add2UC4U,17832
 pystylometry/lexical/hapax.py,sha256=djTqZyZIYXa3GRiPoy6TTGHPm0wCRNJ9U0Rwnf5NoDk,12173
 pystylometry/lexical/mtld.py,sha256=XpeCF8sOXZhWbaazHGuqm08mrOf_DYfkfGGAltWnyy4,7101
+pystylometry/lexical/repetition.py,sha256=A9L0oNwfnCepVkWy57kjHV47Pw4M6fZXEl25hBVdq2s,18318
 pystylometry/lexical/ttr.py,sha256=iEsXkoSPyZEyiiFwKatKA8KhLRukD7RDRvyRkRQOTsk,5848
 pystylometry/lexical/word_frequency_sophistication.py,sha256=OHOS0fBvd1Bz8zsJk-pJbWLTgImmBd-aewQnp_kq8BY,38828
 pystylometry/lexical/yule.py,sha256=NXggha8jmQCu4i-qKZpISwyJBqNpuPHyVR86BLDLgio,5192
@@ -70,7 +71,8 @@ pystylometry/viz/jsx/_base.py,sha256=nd7kEc13fUcRMom3A5jqjGyTy-djIeydq2k3oPHZIHY
 pystylometry/viz/jsx/report.py,sha256=DbbHnnNAEi5tmVg4PmiHb17vkBBXujyE4x1CfVBiOBw,25857
 pystylometry/viz/jsx/timeline.py,sha256=hor-xnBa6oVkSqN0AEZUCQFBOB-iTfHSFZHiEfeakPA,30716
 pystylometry/viz/jsx/viewer.py,sha256=3LO49d_2bRf_P-P-2oSKpKx4N8Ugo4oCLb3DtvyNxXI,43716
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
-pystylometry-1.3.
+pystylometry-1.3.1.dist-info/LICENSE,sha256=vou5JCLAT5nHcsUv-AkjUYAihYfN9mwPDXxV2DHyHBo,1067
+pystylometry-1.3.1.dist-info/METADATA,sha256=Nn-0-ABq9tykuxWpC79GkhHO71oWLnAseh0z9R3mycs,4813
+pystylometry-1.3.1.dist-info/WHEEL,sha256=sP946D7jFCHeNz5Iq4fL4Lu-PrWrFsgfLXbbkciIZwg,88
+pystylometry-1.3.1.dist-info/entry_points.txt,sha256=iHOaFXlyiwcQM1LlID2gWGmN4DBLdTSpKGjttU8tgm8,113
+pystylometry-1.3.1.dist-info/RECORD,,
pystylometry-1.3.0.dist-info/METADATA
DELETED
@@ -1,136 +0,0 @@
-Metadata-Version: 2.4
-Name: pystylometry
-Version: 1.3.0
-Summary: Comprehensive Python package for stylometric analysis
-License: MIT
-Keywords: stylometry,nlp,text-analysis,authorship,readability,lexical-diversity,readability-metrics
-Author: Craig Trim
-Author-email: craigtrim@gmail.com
-Requires-Python: >=3.9,<4.0
-Classifier: Development Status :: 4 - Beta
-Classifier: Intended Audience :: Developers
-Classifier: Intended Audience :: Science/Research
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Programming Language :: Python :: 3.11
-Classifier: Programming Language :: Python :: 3.12
-Classifier: Programming Language :: Python :: 3.13
-Classifier: Programming Language :: Python :: 3.14
-Classifier: Topic :: Scientific/Engineering :: Information Analysis
-Classifier: Topic :: Text Processing :: Linguistic
-Classifier: Typing :: Typed
-Requires-Dist: stylometry-ttr (>=1.0.3,<2.0.0)
-Project-URL: Homepage, https://github.com/craigtrim/pystylometry
-Project-URL: Issues, https://github.com/craigtrim/pystylometry/issues
-Project-URL: Repository, https://github.com/craigtrim/pystylometry
-Description-Content-Type: text/markdown
-
-# pystylometry
-
-[](https://badge.fury.io/py/pystylometry)
-[](https://pepy.tech/project/pystylometry)
-[](https://www.python.org/downloads/)
-[](https://opensource.org/licenses/MIT)
-[]()
-
-Stylometric analysis and authorship attribution for Python. 50+ metrics across 11 modules, from vocabulary diversity to AI-generation detection.
-
-## Install
-
-```bash
-pip install pystylometry       # Core (lexical metrics)
-pip install pystylometry[all]  # Everything
-```
-
-<details>
-<summary>Individual extras</summary>
-
-```bash
-pip install pystylometry[readability]  # Readability formulas (pronouncing, spaCy)
-pip install pystylometry[syntactic]    # POS/parse analysis (spaCy)
-pip install pystylometry[authorship]   # Attribution methods
-pip install pystylometry[ngrams]       # N-gram entropy
-pip install pystylometry[viz]          # Matplotlib visualizations
-```
-</details>
-
-## Usage
-
-```python
-from pystylometry.lexical import compute_mtld, compute_yule
-from pystylometry.readability import compute_flesch
-
-result = compute_mtld(text)
-print(result.mtld_average)  # 72.4
-
-result = compute_flesch(text)
-print(result.reading_ease)  # 65.2
-print(result.grade_level)   # 8.1
-```
-
-Every function returns a typed dataclass with the score, components, and metadata -- never a bare float.
-
-### Unified API
-
-```python
-from pystylometry import analyze
-
-results = analyze(text, lexical=True, readability=True, syntactic=True)
-```
-
-### Style Drift Detection
-
-Detect authorship changes, spliced content, and AI-generated text within a single document.
-
-```python
-from pystylometry.consistency import compute_kilgarriff_drift
-
-result = compute_kilgarriff_drift(document)
-print(result.pattern)             # "sudden_spike"
-print(result.pattern_confidence)  # 0.71
-print(result.max_location)        # Window 23 -- the splice point
-```
-
-### CLI
-
-```bash
-pystylometry-drift manuscript.txt --window-size=500 --stride=250
-pystylometry-viewer report.html
-```
-
-## Modules
-
-| Module | Metrics | Description |
-|--------|---------|-------------|
-| [**lexical**](pystylometry/lexical/) | TTR, MTLD, Yule's K/I, Hapax, MATTR, VocD-D, HD-D, MSTTR, function words, word frequency | Vocabulary diversity and richness |
-| [**readability**](pystylometry/readability/) | Flesch, Flesch-Kincaid, SMOG, Gunning Fog, Coleman-Liau, ARI, Dale-Chall, Fry, FORCAST, Linsear Write, Powers-Sumner-Kearl | Grade-level and difficulty scoring |
-| [**syntactic**](pystylometry/syntactic/) | POS ratios, sentence types, parse tree depth, clausal density, passive voice, T-units, dependency distance | Sentence and parse structure (requires spaCy) |
-| [**authorship**](pystylometry/authorship/) | Burrows' Delta, Cosine Delta, Zeta, Kilgarriff chi-squared, MinMax, John's Delta, NCD | Author attribution and text comparison |
-| [**stylistic**](pystylometry/stylistic/) | Contractions, hedges, intensifiers, modals, punctuation, vocabulary overlap (Jaccard/Dice/Cosine/KL), cohesion, genre/register | Style markers and text similarity |
-| [**character**](pystylometry/character/) | Letter frequencies, digit/uppercase ratios, special characters, whitespace | Character-level fingerprinting |
-| [**ngrams**](pystylometry/ngrams/) | Word/character/POS n-grams, Shannon entropy, skipgrams | N-gram profiles and entropy |
-| [**dialect**](pystylometry/dialect/) | British/American classification, spelling/grammar/vocabulary markers, markedness | Regional dialect detection |
-| [**consistency**](pystylometry/consistency/) | Sliding-window chi-squared drift, pattern classification | Intra-document style analysis |
-| [**prosody**](pystylometry/prosody/) | Syllable stress, rhythm regularity | Prose rhythm (requires spaCy) |
-| [**viz**](pystylometry/viz/) | Timeline, scatter, report (PNG + interactive HTML) | Drift detection visualization |
-
-## Development
-
-```bash
-git clone https://github.com/craigtrim/pystylometry && cd pystylometry
-pip install -e ".[dev,all]"
-make test   # 1022 tests
-make lint   # ruff + mypy
-make all    # lint + test + build
-```
-
-## License
-
-MIT
-
-## Author
-
-Craig Trim -- craigtrim@gmail.com
-

{pystylometry-1.3.0.dist-info → pystylometry-1.3.1.dist-info}/entry_points.txt
File without changes