pystylometry-1.1.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +54 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/RECORD +28 -15
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +0 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/entry_points.txt +0 -0
pystylometry/authorship/compression.py (new file)

@@ -0,0 +1,175 @@
+"""Compression-based authorship attribution using Normalized Compression Distance.
+
+This module implements the Normalized Compression Distance (NCD) method for
+authorship attribution. NCD is a language-independent similarity metric based
+on Kolmogorov complexity, approximated through real-world compressors.
+
+Related GitHub Issue:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+
+The core insight is that if two texts share statistical regularities (as texts
+by the same author tend to), compressing them together yields better compression
+than compressing separately. This captures deep patterns including vocabulary,
+syntax, and stylistic preferences without requiring explicit feature engineering.
+
+References:
+    Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+    Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+    Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+        Information Theory, 50(12), 3250-3264.
+"""
+
+from __future__ import annotations
+
+import bz2
+import gzip
+import zlib
+
+from .._types import CompressionResult
+
+# Supported compressors mapped to their compress functions
+_COMPRESSORS: dict[str, type] = {
+    "gzip": type(None),  # placeholder, handled below
+    "zlib": type(None),
+    "bz2": type(None),
+}
+
+_VALID_COMPRESSORS = frozenset({"gzip", "zlib", "bz2"})
+
+
+def _compress(data: bytes, compressor: str) -> bytes:
+    """Compress data using the specified algorithm.
+
+    Args:
+        data: Raw bytes to compress.
+        compressor: One of "gzip", "zlib", or "bz2".
+
+    Returns:
+        Compressed bytes.
+    """
+    if compressor == "gzip":
+        return gzip.compress(data)
+    if compressor == "zlib":
+        return zlib.compress(data)
+    if compressor == "bz2":
+        return bz2.compress(data)
+    raise ValueError(f"Unknown compressor: {compressor}")  # pragma: no cover
+
+
+def compute_compression_distance(
+    text1: str,
+    text2: str,
+    compressor: str = "gzip",
+) -> CompressionResult:
+    """
+    Compute Normalized Compression Distance (NCD) between two texts.
+
+    NCD approximates the normalized information distance, a universal similarity
+    metric based on Kolmogorov complexity. Since Kolmogorov complexity is
+    uncomputable, NCD uses real-world compressors as practical approximations.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Algorithm:
+        1. Encode both texts as UTF-8 bytes
+        2. Compress text1, text2, and their concatenation separately
+        3. Compute NCD using the formula:
+           NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+    Interpretation:
+        - NCD ~ 0: Texts are maximally similar (identical information content)
+        - NCD ~ 1: Texts are maximally different (no shared information)
+        - Values slightly above 1.0 are possible due to compressor overhead
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    Compressor choice:
+        - "gzip" (default): Good balance of speed and accuracy; most widely used
+          in NCD literature. Uses Lempel-Ziv (LZ77) algorithm.
+        - "zlib": Same underlying algorithm as gzip but lower overhead. Slightly
+          faster, very similar results.
+        - "bz2": Uses Burrows-Wheeler transform. Better compression but slower.
+          May capture different patterns than LZ-based methods.
+
+    References:
+        Cilibrasi, R., & Vitanyi, P. M. (2005). Clustering by compression.
+            IEEE Transactions on Information Theory, 51(4), 1523-1545.
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+            zipping. Physical Review Letters, 88(4), 048702.
+        Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+            Information Theory, 50(12), 3250-3264.
+
+    Args:
+        text1: First text for comparison.
+        text2: Second text for comparison.
+        compressor: Compression algorithm to use ("gzip", "zlib", or "bz2").
+
+    Returns:
+        CompressionResult with NCD score and compression details.
+
+    Raises:
+        ValueError: If compressor is not one of "gzip", "zlib", "bz2".
+
+    Example:
+        >>> result = compute_compression_distance(text_by_author_a, text_by_author_b)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> print(f"Compressor: {result.compressor}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+    if compressor not in _VALID_COMPRESSORS:
+        raise ValueError(
+            f"compressor must be one of {sorted(_VALID_COMPRESSORS)}, got '{compressor}'"
+        )
+
+    # Encode texts as bytes
+    bytes1 = text1.encode("utf-8")
+    bytes2 = text2.encode("utf-8")
+    bytes_combined = bytes1 + bytes2
+
+    # Compress each component
+    compressed1 = _compress(bytes1, compressor)
+    compressed2 = _compress(bytes2, compressor)
+    compressed_combined = _compress(bytes_combined, compressor)
+
+    c1 = len(compressed1)
+    c2 = len(compressed2)
+    c12 = len(compressed_combined)
+
+    # NCD formula: (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+    min_c = min(c1, c2)
+    max_c = max(c1, c2)
+
+    if max_c == 0:
+        # Both texts are empty
+        ncd = 0.0
+    else:
+        ncd = (c12 - min_c) / max_c
+
+    # Compute compression ratios for metadata
+    raw1 = len(bytes1)
+    raw2 = len(bytes2)
+    raw_combined = len(bytes_combined)
+
+    return CompressionResult(
+        ncd=ncd,
+        compressor=compressor,
+        text1_compressed_size=c1,
+        text2_compressed_size=c2,
+        combined_compressed_size=c12,
+        metadata={
+            "text1_raw_size": raw1,
+            "text2_raw_size": raw2,
+            "combined_raw_size": raw_combined,
+            "text1_compression_ratio": c1 / raw1 if raw1 > 0 else 0.0,
+            "text2_compression_ratio": c2 / raw2 if raw2 > 0 else 0.0,
+            "combined_compression_ratio": c12 / raw_combined if raw_combined > 0 else 0.0,
+            "min_compressed": min_c,
+            "max_compressed": max_c,
+        },
+    )
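For orientation, here is a minimal attribution sketch built on the new function. It is not taken from the package: the import path follows the module's location in the file list above, and the candidate texts are placeholders. The `.ncd` field is confirmed by the docstring example.

```python
# Minimal NCD attribution sketch (illustrative; not shipped with the package).
# Import path assumed from the new module's location; texts are placeholders.
from pystylometry.authorship.compression import compute_compression_distance

disputed = "…text of unknown authorship…"
sample_author_a = "…known writing by candidate author A…"
sample_author_b = "…known writing by candidate author B…"

ncd_a = compute_compression_distance(disputed, sample_author_a, compressor="gzip")
ncd_b = compute_compression_distance(disputed, sample_author_b, compressor="gzip")

# Lower NCD means more shared structure, hence the likelier author.
print(f"NCD vs. A: {ncd_a.ncd:.3f}  NCD vs. B: {ncd_b.ncd:.3f}")
print("Closer match:", "A" if ncd_a.ncd < ncd_b.ncd else "B")
```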
pystylometry/authorship/kilgarriff.py

@@ -249,6 +249,7 @@ def compute_kilgarriff(
     text1: str,
     text2: str,
     n_words: int = 500,
+    top_features: int = 20,
 ) -> KilgarriffResult:
     """
     Compute Kilgarriff's chi-squared distance between two texts.

@@ -292,6 +293,8 @@ def compute_kilgarriff(
         text2: Second text for comparison
         n_words: Number of most frequent words to analyze (default: 500).
             Higher values provide finer discrimination but require longer texts.
+        top_features: Number of most distinctive features to return (default: 20).
+            Controls the length of most_distinctive_features in the result.
 
     Returns:
         KilgarriffResult containing:

@@ -318,6 +321,10 @@ def compute_kilgarriff(
         violated in text analysis. The raw chi_squared value is more reliable
         for relative comparisons between text pairs.
     """
+    # Validate top_features
+    if top_features < 1:
+        raise ValueError("top_features must be >= 1")
+
     # Tokenize and lowercase
     # Using lowercase ensures "The" and "the" are counted together
     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]

@@ -337,7 +344,7 @@ def compute_kilgarriff(
         p_value=p_value,
         degrees_of_freedom=df,
         feature_count=len(contributions),
-        most_distinctive_features=contributions[:
+        most_distinctive_features=contributions[:top_features],
         metadata={
             **details,
             "all_contributions": contributions,  # Full list for detailed analysis
pystylometry/character/README.md (new file)

@@ -0,0 +1,17 @@
+# character
+
+
+
+
+Character-level features for stylometric fingerprinting.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `character_metrics.py` | `compute_character_metrics` | Letter frequencies, digit ratios, uppercase ratios, special character usage, whitespace patterns |
+
+## See Also
+
+- [`ngrams/`](../ngrams/) for character-level bigram entropy
+- [`stylistic/markers.py`](../stylistic/) for punctuation style analysis
pystylometry/consistency/README.md (new file)

@@ -0,0 +1,27 @@
+# consistency
+
+
+
+
+Intra-document style drift detection using sliding-window chi-squared analysis.
+
+## Catalogue
+
+| File | Function | What It Does |
+|------|----------|-------------|
+| `drift.py` | `compute_kilgarriff_drift` | Detects stylistic drift, splice points, and AI-generation signatures |
+| `_thresholds.py` | _(internal)_ | Classification thresholds for pattern detection |
+
+## Detected Patterns
+
+| Pattern | Meaning |
+|---------|---------|
+| `consistent` | Natural human variation throughout |
+| `gradual_drift` | Style shifts progressively over the document |
+| `sudden_spike` | Abrupt discontinuity (possible splice or paste) |
+| `suspiciously_uniform` | Unnaturally low variation (possible AI generation) |
+
+## See Also
+
+- [`authorship/kilgarriff.py`](../authorship/) -- the underlying chi-squared method (between-text comparison)
+- [`viz/`](../viz/) for timeline and report visualizations of drift results
pystylometry/dialect/README.md (new file)

@@ -0,0 +1,26 @@
+# dialect
+
+
+
+
+Regional dialect detection (British vs. American English) with markedness scoring.
+
+## Catalogue
+
+| File | Function | What It Does |
+|------|----------|-------------|
+| `detector.py` | `compute_dialect` | Classifies text dialect, computes British/American scores, markedness |
+| `_loader.py` | `get_markers`, `DialectMarkers` | Loads and caches extensible JSON marker database |
+| `_data/dialect_markers.json` | _(data)_ | Vocabulary, spelling, grammar, and eye-dialect markers |
+
+## Detection Categories
+
+- **Vocabulary** -- flat/apartment, lorry/truck, boot/trunk
+- **Spelling** -- colour/color, organise/organize, centre/center
+- **Grammar** -- collective noun agreement, "have got" patterns
+- **Eye dialect** -- gonna, wanna (register markers, not true dialect)
+
+## See Also
+
+- [`stylistic/`](../stylistic/) for broader style marker analysis
+- [`stylistic/genre_register.py`](../stylistic/) for formality and register classification
pystylometry/lexical/README.md (new file)

@@ -0,0 +1,23 @@
+# lexical
+
+
+
+
+Vocabulary diversity, richness, and frequency analysis.
+
+## Catalogue
+
+| File | Functions | What It Measures |
+|------|-----------|-----------------|
+| `ttr.py` | `compute_ttr` | Type-Token Ratio (basic vocabulary diversity) |
+| `mtld.py` | `compute_mtld` | Measure of Textual Lexical Diversity |
+| `yule.py` | `compute_yule` | Yule's K and I (frequency spectrum measures) |
+| `hapax.py` | `compute_hapax_ratios`, `compute_hapax_with_lexicon_analysis` | Hapax legomena, Honore's R, Sichel's S |
+| `advanced_diversity.py` | `compute_vocd_d`, `compute_mattr`, `compute_hdd`, `compute_msttr` | VocD-D, MATTR, HD-D, MSTTR |
+| `function_words.py` | `compute_function_words` | Function word frequencies by category |
+| `word_frequency_sophistication.py` | `compute_word_frequency_sophistication` | Vocabulary sophistication via frequency bands |
+
+## See Also
+
+- [`authorship/`](../authorship/) uses lexical features for attribution
+- [`stylistic/vocabulary_overlap.py`](../stylistic/) for Jaccard, Dice, and KL divergence between texts
pystylometry/ngrams/README.md (new file)

@@ -0,0 +1,18 @@
+# ngrams
+
+
+
+
+N-gram generation, entropy computation, and sequence analysis.
+
+## Catalogue
+
+| File | Functions | What It Measures |
+|------|-----------|-----------------|
+| `entropy.py` | `compute_ngram_entropy`, `compute_character_bigram_entropy`, `compute_word_bigram_entropy` | Shannon entropy at character and word n-gram levels |
+| `extended_ngrams.py` | `compute_extended_ngrams` | Word, character, and POS n-gram profiles with frequency distributions |
+
+## See Also
+
+- [`syntactic/`](../syntactic/) provides POS tags consumed by `compute_extended_ngrams(text, pos=True)`
+- [`character/`](../character/) for character-level features without n-gram structure