pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46

pystylometry/authorship/README.md
@@ -0,0 +1,21 @@
+# authorship
+
+
+
+
+Authorship attribution methods for comparing texts and determining likely authorship.
+
+## Catalogue
+
+| File | Functions | Method |
+|------|-----------|--------|
+| `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+| `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+| `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+| `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+| `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+## See Also
+
+- [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+- [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on

pystylometry/authorship/__init__.py
@@ -1,14 +1,38 @@
-"""Authorship attribution metrics.
+"""Authorship attribution metrics.
 
-
+This module provides methods for authorship attribution - comparing texts to
+determine whether they were written by the same author. Available methods
+include classic approaches (Burrows' Delta, Zeta), statistical methods
+(Kilgarriff's chi-squared), and information-theoretic methods (NCD).
+
+Related GitHub Issues:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+    #31 - Classical Stylometric Methods from Programming Historian
+    https://github.com/craigtrim/pystylometry/issues/31
+
+Available Functions:
+    compute_burrows_delta: Classic Delta method for authorship distance
+    compute_cosine_delta: Angular distance variant of Delta
+    compute_zeta: Zeta method for marker word detection
+    compute_kilgarriff: Chi-squared method for corpus comparison
+    compute_minmax: Burrows' original min-max distance method
+    compute_johns_delta: Delta variations (quadratic, weighted)
+    compute_compression_distance: Normalized Compression Distance (NCD)
+"""
+
+from .additional_methods import compute_johns_delta, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+from .compression import compute_compression_distance
+from .kilgarriff import compute_kilgarriff
 from .zeta import compute_zeta
 
 __all__ = [
     "compute_burrows_delta",
+    "compute_compression_distance",
     "compute_cosine_delta",
-    "
+    "compute_johns_delta",
     "compute_kilgarriff",
     "compute_minmax",
-    "
+    "compute_zeta",
 ]
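
For readers skimming the diff, here is a minimal usage sketch of the new package-level exports. It relies only on the function names in `__all__` above and the signatures and result fields shown in the docstrings further down (`compute_minmax`, `compute_johns_delta`, `compute_compression_distance`); the input texts are placeholders, not real corpora.

```python
# Minimal sketch of the 1.3.0 authorship API, based on the exports and
# docstring examples in this diff. Input texts are placeholders.
from pystylometry.authorship import (
    compute_compression_distance,
    compute_johns_delta,
    compute_minmax,
)

text_a = "The quick brown fox jumps over the lazy dog. " * 50
text_b = "A slow red fox strolls past the sleeping hound. " * 50

minmax = compute_minmax(text_a, text_b, mfw=100)
print(f"MinMax distance: {minmax.minmax_distance:.3f}")

delta = compute_johns_delta(text_a, text_b, method="quadratic")
print(f"Quadratic Delta: {delta.delta_score:.3f}")

ncd = compute_compression_distance(text_a, text_b, compressor="gzip")
print(f"NCD ({ncd.compressor}): {ncd.ncd:.3f}")
```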

pystylometry/authorship/additional_methods.py
@@ -8,65 +8,150 @@ Related GitHub Issue:
     https://github.com/craigtrim/pystylometry/issues/24
 
 Methods implemented:
-- Kilgarriff's Chi-squared
-- Min-Max (Burrows' original method)
-- John Burrows' Delta variations
+- Kilgarriff's Chi-squared -> See kilgarriff.py (Issue #31)
+- Min-Max distance (Burrows' original method)
+- John Burrows' Delta variations (Quadratic, Weighted)
 
 References:
     Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
     Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
     Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+    Argamon, S. (2008). Interpreting Burrows's Delta. Literary and Linguistic Computing.
 """
 
-from
+from __future__ import annotations
 
+import math
+from collections import Counter
 
-
+from .._types import JohnsBurrowsResult, MinMaxResult
+from .._utils import tokenize
+
+
+def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
     """
-    Compute
+    Compute Min-Max distance between two texts.
+
+    This is Burrows' original method from his 1992 paper, before the development
+    of Delta. It normalizes word frequencies using min-max scaling and computes
+    the mean absolute distance between normalized frequency vectors.
 
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
         https://github.com/craigtrim/pystylometry/issues/24
 
+    Algorithm:
+        1. Tokenize both texts and build frequency distributions
+        2. Identify the top N most frequent words in the joint corpus
+        3. Compute relative frequencies for each word in each text
+        4. Normalize each word's frequencies using min-max scaling:
+           normalized(f) = (f - min) / (max - min)
+        5. Compute mean absolute difference of normalized frequencies
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Higher values indicate more different texts
+        - Scale: 0.0 (identical) to 1.0 (maximally different)
+
+    References:
+        Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+            nexus between analysis and information. Literary and Linguistic
+            Computing, 7(2), 91-109.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
+        mfw: Number of most frequent words to analyze (default: 100)
 
     Returns:
-
-
+        MinMaxResult with min-max distance and distinctive features.
+
+    Example:
+        >>> result = compute_minmax(text_by_author_a, text_by_author_b)
+        >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+        >>> print(f"Most distinctive: {result.most_distinctive_features[0]}")
     """
-    #
-
-
-        "Kilgarriff's chi-squared not yet implemented. "
-        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
-    )
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
 
+    if not tokens1 or not tokens2:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
 
-
-
-
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
 
-
-
-
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
 
-
-
-
-
+    if not top_words:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
 
-
-
-
-
-    #
-
-
-
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Min-Max normalization per feature across both texts
+    # Then compute absolute distance
+    contributions: list[tuple[str, float]] = []
+    total_distance = 0.0
+
+    for i, word in enumerate(top_words):
+        f1, f2 = rel1[i], rel2[i]
+        max_val = max(f1, f2)
+
+        if max_val > 0:
+            # Min-Max normalized distance for this feature
+            dist = abs(f1 - f2) / max_val
+        else:
+            dist = 0.0
+
+        total_distance += dist
+        contributions.append((word, dist))
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    # Mean distance across all features
+    minmax_distance = total_distance / len(top_words) if top_words else 0.0
+
+    return MinMaxResult(
+        minmax_distance=minmax_distance,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "method": "minmax_1992",
+            "all_contributions": contributions,
+        },
     )
 
 
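
To make the Algorithm and Interpretation sections of the new docstring concrete, here is a tiny self-contained sketch of the per-feature computation the new `compute_minmax` body performs. The relative frequencies are invented illustrative numbers, not taken from any corpus.

```python
# Standalone illustration of the per-feature distance used above:
#     dist_w = |f1_w - f2_w| / max(f1_w, f2_w)
# averaged over the most-frequent-word list. Frequencies are made up.
rel1 = {"the": 0.060, "of": 0.020, "and": 0.030}  # text 1 relative frequencies
rel2 = {"the": 0.040, "of": 0.030, "and": 0.030}  # text 2 relative frequencies

dists = []
for w in rel1:
    f1, f2 = rel1[w], rel2[w]
    max_val = max(f1, f2)
    dists.append(abs(f1 - f2) / max_val if max_val > 0 else 0.0)

minmax_distance = sum(dists) / len(dists)
print(round(minmax_distance, 3))  # "the": 0.333, "of": 0.333, "and": 0.0 -> mean 0.222
```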

@@ -79,22 +79,164 @@ def compute_johns_delta(
     """
     Compute John Burrows' Delta variations.
 
+    This implements alternative formulations of Burrows' Delta metric beyond
+    the standard mean absolute z-score difference. The quadratic variant uses
+    squared z-score differences (Euclidean distance), while the weighted variant
+    applies inverse-rank weighting so higher-frequency words contribute more.
+
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
         https://github.com/craigtrim/pystylometry/issues/24
 
+    Methods:
+        - "quadratic": Euclidean distance of z-scores
+          Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
+
+        - "weighted": Inverse-rank weighted Delta
+          Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i)
+          where w_i = 1 / rank_i
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Quadratic Delta penalizes large deviations more than standard Delta
+        - Weighted Delta emphasizes the most frequent words
+
+    References:
+        Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+            parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+        Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+            probabilistic foundations. Literary and Linguistic Computing,
+            23(2), 131-147.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
-        method: Delta variation ("quadratic"
+        mfw: Number of most frequent words to analyze (default: 100)
+        method: Delta variation ("quadratic" or "weighted")
 
     Returns:
         JohnsBurrowsResult with delta score and method details.
+
+    Example:
+        >>> result = compute_johns_delta(text1, text2, method="quadratic")
+        >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
     """
-
-
-
-
-
+    if method not in ("quadratic", "weighted"):
+        raise ValueError(f"method must be 'quadratic' or 'weighted', got '{method}'")
+
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    if not tokens1 or not tokens2:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
+
+    if not top_words:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
+
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Mean-normalized differences
+    # With only 2 texts, classical z-scores are degenerate: stdev([a,b]) is
+    # always |a-b|/sqrt(2), producing identical z-scores (±0.707) for all
+    # features with any difference. Instead, we normalize by the mean frequency
+    # of each feature across both texts, which preserves discriminative power:
+    #     normalized_i = (f1_i - f2_i) / mean(f1_i, f2_i)
+    # This weights words proportionally to how much they differ relative to
+    # their expected frequency, preventing high-frequency words from dominating
+    # through absolute differences alone.
+    z1: list[float] = []
+    z2: list[float] = []
+    for i in range(len(top_words)):
+        mean_val = (rel1[i] + rel2[i]) / 2
+        # Normalize by mean frequency; use epsilon for words absent in both
+        norm = mean_val if mean_val > 0 else 1e-10
+        z1.append((rel1[i] - mean_val) / norm)
+        z2.append((rel2[i] - mean_val) / norm)
+
+    # Compute distance based on method
+    contributions: list[tuple[str, float]] = []
+
+    if method == "quadratic":
+        # Quadratic Delta: root mean squared z-score difference
+        squared_diffs: list[float] = []
+        for i, word in enumerate(top_words):
+            diff_sq = (z1[i] - z2[i]) ** 2
+            squared_diffs.append(diff_sq)
+            contributions.append((word, diff_sq))
+
+        delta_score = math.sqrt(sum(squared_diffs) / len(squared_diffs)) if squared_diffs else 0.0
+
+    else:  # weighted
+        # Weighted Delta: inverse-rank weighted mean absolute z-score difference
+        weighted_diffs: list[float] = []
+        weights: list[float] = []
+        for i, word in enumerate(top_words):
+            weight = 1.0 / (i + 1)  # Inverse rank weighting
+            abs_diff = abs(z1[i] - z2[i])
+            weighted_diffs.append(weight * abs_diff)
+            weights.append(weight)
+            contributions.append((word, abs_diff))
+
+        delta_score = sum(weighted_diffs) / sum(weights) if weights else 0.0
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    return JohnsBurrowsResult(
+        delta_score=delta_score,
+        method=method,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "z_scores_text1": dict(zip(top_words[:20], z1[:20])),
+            "z_scores_text2": dict(zip(top_words[:20], z2[:20])),
+            "all_contributions": contributions,
+        },
     )
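
The two Delta formulas quoted in the docstring are easy to check in isolation. A minimal sketch, assuming nothing beyond the Delta_Q and Delta_W definitions above; the normalized-difference vectors are invented for illustration.

```python
import math

# Invented mean-normalized difference vectors for the top-5 words,
# ordered by rank (rank 1 = most frequent word in the joint corpus).
z1 = [0.10, -0.20, 0.05, 0.30, -0.10]
z2 = [-0.10, 0.15, 0.05, -0.05, 0.00]

diffs = [abs(a - b) for a, b in zip(z1, z2)]

# Quadratic Delta: Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
delta_q = math.sqrt(sum(d * d for d in diffs) / len(diffs))

# Weighted Delta: Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i), with w_i = 1 / rank_i
weights = [1.0 / (i + 1) for i in range(len(diffs))]
delta_w = sum(w * d for w, d in zip(weights, diffs)) / sum(weights)

print(round(delta_q, 3), round(delta_w, 3))
```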

pystylometry/authorship/compression.py
@@ -0,0 +1,175 @@
+"""Compression-based authorship attribution using Normalized Compression Distance.
+
+This module implements the Normalized Compression Distance (NCD) method for
+authorship attribution. NCD is a language-independent similarity metric based
+on Kolmogorov complexity, approximated through real-world compressors.
+
+Related GitHub Issue:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+
+The core insight is that if two texts share statistical regularities (as texts
+by the same author tend to), compressing them together yields better compression
+than compressing separately. This captures deep patterns including vocabulary,
+syntax, and stylistic preferences without requiring explicit feature engineering.
+
+References:
+    Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+    Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+    Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+        Information Theory, 50(12), 3250-3264.
+"""
+
+from __future__ import annotations
+
+import bz2
+import gzip
+import zlib
+
+from .._types import CompressionResult
+
+# Supported compressors mapped to their compress functions
+_COMPRESSORS: dict[str, type] = {
+    "gzip": type(None),  # placeholder, handled below
+    "zlib": type(None),
+    "bz2": type(None),
+}
+
+_VALID_COMPRESSORS = frozenset({"gzip", "zlib", "bz2"})
+
+
+def _compress(data: bytes, compressor: str) -> bytes:
+    """Compress data using the specified algorithm.
+
+    Args:
+        data: Raw bytes to compress.
+        compressor: One of "gzip", "zlib", or "bz2".
+
+    Returns:
+        Compressed bytes.
+    """
+    if compressor == "gzip":
+        return gzip.compress(data)
+    if compressor == "zlib":
+        return zlib.compress(data)
+    if compressor == "bz2":
+        return bz2.compress(data)
+    raise ValueError(f"Unknown compressor: {compressor}")  # pragma: no cover
+
+
+def compute_compression_distance(
+    text1: str,
+    text2: str,
+    compressor: str = "gzip",
+) -> CompressionResult:
+    """
+    Compute Normalized Compression Distance (NCD) between two texts.
+
+    NCD approximates the normalized information distance, a universal similarity
+    metric based on Kolmogorov complexity. Since Kolmogorov complexity is
+    uncomputable, NCD uses real-world compressors as practical approximations.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Algorithm:
+        1. Encode both texts as UTF-8 bytes
+        2. Compress text1, text2, and their concatenation separately
+        3. Compute NCD using the formula:
+           NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+    Interpretation:
+        - NCD ~ 0: Texts are maximally similar (identical information content)
+        - NCD ~ 1: Texts are maximally different (no shared information)
+        - Values slightly above 1.0 are possible due to compressor overhead
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    Compressor choice:
+        - "gzip" (default): Good balance of speed and accuracy; most widely used
+          in NCD literature. Uses Lempel-Ziv (LZ77) algorithm.
+        - "zlib": Same underlying algorithm as gzip but lower overhead. Slightly
+          faster, very similar results.
+        - "bz2": Uses Burrows-Wheeler transform. Better compression but slower.
+          May capture different patterns than LZ-based methods.
+
+    References:
+        Cilibrasi, R., & Vitanyi, P. M. (2005). Clustering by compression.
+            IEEE Transactions on Information Theory, 51(4), 1523-1545.
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+            zipping. Physical Review Letters, 88(4), 048702.
+        Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+            Information Theory, 50(12), 3250-3264.
+
+    Args:
+        text1: First text for comparison.
+        text2: Second text for comparison.
+        compressor: Compression algorithm to use ("gzip", "zlib", or "bz2").
+
+    Returns:
+        CompressionResult with NCD score and compression details.
+
+    Raises:
+        ValueError: If compressor is not one of "gzip", "zlib", "bz2".
+
+    Example:
+        >>> result = compute_compression_distance(text_by_author_a, text_by_author_b)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> print(f"Compressor: {result.compressor}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+    if compressor not in _VALID_COMPRESSORS:
+        raise ValueError(
+            f"compressor must be one of {sorted(_VALID_COMPRESSORS)}, got '{compressor}'"
+        )
+
+    # Encode texts as bytes
+    bytes1 = text1.encode("utf-8")
+    bytes2 = text2.encode("utf-8")
+    bytes_combined = bytes1 + bytes2
+
+    # Compress each component
+    compressed1 = _compress(bytes1, compressor)
+    compressed2 = _compress(bytes2, compressor)
+    compressed_combined = _compress(bytes_combined, compressor)
+
+    c1 = len(compressed1)
+    c2 = len(compressed2)
+    c12 = len(compressed_combined)
+
+    # NCD formula: (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+    min_c = min(c1, c2)
+    max_c = max(c1, c2)
+
+    if max_c == 0:
+        # Both texts are empty
+        ncd = 0.0
+    else:
+        ncd = (c12 - min_c) / max_c
+
+    # Compute compression ratios for metadata
+    raw1 = len(bytes1)
+    raw2 = len(bytes2)
+    raw_combined = len(bytes_combined)
+
+    return CompressionResult(
+        ncd=ncd,
+        compressor=compressor,
+        text1_compressed_size=c1,
+        text2_compressed_size=c2,
+        combined_compressed_size=c12,
+        metadata={
+            "text1_raw_size": raw1,
+            "text2_raw_size": raw2,
+            "combined_raw_size": raw_combined,
+            "text1_compression_ratio": c1 / raw1 if raw1 > 0 else 0.0,
+            "text2_compression_ratio": c2 / raw2 if raw2 > 0 else 0.0,
+            "combined_compression_ratio": c12 / raw_combined if raw_combined > 0 else 0.0,
+            "min_compressed": min_c,
+            "max_compressed": max_c,
+        },
+    )
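
The NCD formula in the docstring can also be reproduced with the standard library alone. A minimal sketch, independent of the `CompressionResult` wrapper added above; the sample strings are placeholders.

```python
import zlib

def ncd(x: str, y: str) -> float:
    """Normalized Compression Distance via zlib, per the formula above."""
    cx = len(zlib.compress(x.encode("utf-8")))
    cy = len(zlib.compress(y.encode("utf-8")))
    cxy = len(zlib.compress((x + y).encode("utf-8")))
    # NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
    return (cxy - min(cx, cy)) / max(cx, cy) if max(cx, cy) else 0.0

similar = ncd("to be or not to be " * 40, "to be or not to be, that is the question " * 40)
dissimilar = ncd("to be or not to be " * 40, "import zlib; print(zlib.compress(b'x')) " * 40)
print(round(similar, 3), round(dissimilar, 3))  # the more similar pair should score lower
```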