pystylometry 1.1.0-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +54 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/RECORD +28 -15
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +0 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/entry_points.txt +0 -0
pystylometry/README.md
ADDED
@@ -0,0 +1,42 @@
+# pystylometry
+
+
+
+
+Core package for stylometric analysis and authorship attribution.
+
+## Module Map
+
+| Module | Purpose | Key Functions |
+|--------|---------|---------------|
+| [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+| [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+| [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+| [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+| [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+| [`character/`](character/) | Character-level features | `compute_character_metrics` |
+| [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+| [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+| [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+| [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+| [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+## Shared Internals
+
+| File | Purpose |
+|------|---------|
+| `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+| `_normalize.py` | Text normalization for readability and stylometry pipelines |
+| `_utils.py` | Shared tokenization and helper functions |
+| `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+| `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+## Installation Extras
+
+```
+pip install pystylometry                 # Core (lexical only)
+pip install pystylometry[readability]    # + readability
+pip install pystylometry[syntactic]      # + syntactic (requires spaCy)
+pip install pystylometry[authorship]     # + authorship attribution
+pip install pystylometry[all]            # Everything
+```
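The extras map onto optional submodules, and the `get_available_modules()` helper changed in `pystylometry/__init__.py` below reports which ones actually imported in a given environment. A minimal sketch of checking that at runtime (key names are taken from this diff; the exact values depend on which extras are installed):

```python
import pystylometry

# Maps module name -> availability, e.g. {"authorship": True, "prosody": False, ...}
available = pystylometry.get_available_modules()

for name, ok in sorted(available.items()):
    print(f"{name:<12} {'available' if ok else 'missing (install the matching extra)'}")
```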
pystylometry/__init__.py
CHANGED
@@ -63,18 +63,28 @@ try:
 except ImportError:
     _SYNTACTIC_AVAILABLE = False
 
-#
+# Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+try:
+    from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+    _PROSODY_AVAILABLE = True
+except ImportError:
+    _PROSODY_AVAILABLE = False
+
+# Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
 from . import (
     authorship,  # noqa: F401
     consistency,  # noqa: F401 - Style drift detection (Issue #36)
     dialect,  # noqa: F401
     ngrams,  # noqa: F401
+    stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
 )
 
 _AUTHORSHIP_AVAILABLE = True
 _NGRAMS_AVAILABLE = True
 _DIALECT_AVAILABLE = True
 _CONSISTENCY_AVAILABLE = True
+_STYLISTIC_AVAILABLE = True
 
 
 def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
         "ngrams": _NGRAMS_AVAILABLE,
         "dialect": _DIALECT_AVAILABLE,
         "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+        "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+        "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
     }
 
 
@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
     __all__.append("dialect")
 if _CONSISTENCY_AVAILABLE:
     __all__.append("consistency")
+if _STYLISTIC_AVAILABLE:
+    __all__.append("stylistic")
+if _PROSODY_AVAILABLE:
+    __all__.append("prosody")
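Because the new modules are guarded by the same try/except pattern, callers can branch on the availability flags instead of catching ImportError themselves. A minimal sketch; `compute_rhythm_prosody(text)` taking a single string is an assumption here, only the function name appears in this diff:

```python
import pystylometry

text = "The quick brown fox jumps over the lazy dog."

if pystylometry.get_available_modules().get("prosody"):
    # Hypothetical call: the exact signature of compute_rhythm_prosody is not shown in this diff.
    result = pystylometry.prosody.compute_rhythm_prosody(text)
    print(result)
else:
    print("Prosody extra not installed; try: pip install pystylometry[all]")
```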
pystylometry/_types.py
CHANGED
@@ -1517,6 +1517,7 @@ class VocabularyOverlapResult:
         - Dice coefficient (2 * intersection / sum of sizes)
         - Overlap coefficient (intersection / min(size1, size2))
         - Cosine similarity (using word frequency vectors)
+        - KL divergence (asymmetric distributional difference)
         - Shared vocabulary size and ratio
         - Unique words in each text
         - Most distinctive words for each text
@@ -1526,6 +1527,10 @@ class VocabularyOverlapResult:
         New Phytologist, 11(2), 37-50.
         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
         Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 
     Example:
         >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1544,7 @@ class VocabularyOverlapResult:
     dice_coefficient: float  # 2 * intersection / (size1 + size2)
     overlap_coefficient: float  # Intersection / min(size1, size2)
     cosine_similarity: float  # Cosine of frequency vectors
+    kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)
 
     # Vocabulary sizes
     text1_vocab_size: int  # Unique words in text 1
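The new `kl_divergence` field is documented as the asymmetric divergence of text1's word distribution from text2's (text1 || text2). For reference, the quantity can be reproduced from two token lists in a few lines; this is a self-contained sketch with add-one smoothing so the value stays finite — the smoothing choice is an assumption, not taken from the package:

```python
import math
from collections import Counter


def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
    """D_KL(P || Q) for word distributions P (text1) and Q (text2), add-one smoothed."""
    vocab = set(tokens1) | set(tokens2)
    c1, c2 = Counter(tokens1), Counter(tokens2)
    n1 = len(tokens1) + len(vocab)
    n2 = len(tokens2) + len(vocab)
    total = 0.0
    for w in vocab:
        p = (c1[w] + 1) / n1
        q = (c2[w] + 1) / n2
        total += p * math.log(p / q)
    return total


# Asymmetric: swapping the arguments gives a different (still non-negative) value.
print(kl_divergence("a a b".split(), "a b b b".split()))
```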
@@ -1897,6 +1903,54 @@ class JohnsBurrowsResult:
     metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.
 
 
+@dataclass
+class CompressionResult:
+    """Result from compression-based authorship attribution.
+
+    Compression-based methods use the Normalized Compression Distance (NCD) to
+    measure similarity between texts. The intuition is that if two texts are
+    similar, compressing them together will yield better compression than
+    compressing separately. This approach is language-independent and captures
+    deep statistical regularities.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Formula:
+        NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+        where C(x) is the compressed size of x, and C(xy) is the compressed
+        size of x and y concatenated.
+
+    Interpretation:
+        - NCD ≈ 0: Texts are very similar
+        - NCD ≈ 1: Texts are very different
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    References:
+        Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+
+    Example:
+        >>> result = compute_compression_distance(text1, text2)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+
+    ncd: float  # Normalized Compression Distance [0, 1+]
+    compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+    text1_compressed_size: int  # Compressed size of text1 alone
+    text2_compressed_size: int  # Compressed size of text2 alone
+    combined_compressed_size: int  # Compressed size of concatenated texts
+    metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
 # ===== Rhythm and Prosody Results =====
 # Related to GitHub Issue #25: Rhythm and Prosody Metrics
 # https://github.com/craigtrim/pystylometry/issues/25
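The NCD formula in the `CompressionResult` docstring is easy to reproduce with the standard library. A minimal sketch using `zlib`; the package's own `compression.py` may choose a different compressor or preprocessing, so treat this only as an illustration of the formula, not as the library's implementation:

```python
import zlib


def ncd(x: str, y: str) -> float:
    """Normalized Compression Distance: (C(xy) - min(C(x), C(y))) / max(C(x), C(y))."""
    cx = len(zlib.compress(x.encode("utf-8")))
    cy = len(zlib.compress(y.encode("utf-8")))
    cxy = len(zlib.compress((x + y).encode("utf-8")))
    return (cxy - min(cx, cy)) / max(cx, cy)


a = "It was the best of times, it was the worst of times." * 20
b = "Call me Ishmael. Some years ago, never mind how long precisely." * 20
print(f"NCD(a, a) = {ncd(a, a):.3f}")  # near 0: identical texts compress well together
print(f"NCD(a, b) = {ncd(a, b):.3f}")  # larger: different texts share less structure
```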
pystylometry/authorship/README.md
ADDED
@@ -0,0 +1,21 @@
+# authorship
+
+
+
+
+Authorship attribution methods for comparing texts and determining likely authorship.
+
+## Catalogue
+
+| File | Functions | Method |
+|------|-----------|--------|
+| `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+| `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+| `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+| `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+| `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+## See Also
+
+- [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+- [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
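Combining the catalogue with the signatures visible later in this diff, a pairwise comparison might look like the sketch below. File paths are placeholders; only the `compute_minmax`, `compute_johns_delta`, and `compute_compression_distance` argument lists and result attributes are confirmed in this diff, and the thresholds are illustrative:

```python
from pathlib import Path

from pystylometry.authorship import (
    compute_compression_distance,
    compute_johns_delta,
    compute_minmax,
)

# Hypothetical sample files; any two strings work.
text_a = Path("author_a_sample.txt").read_text(encoding="utf-8")
text_b = Path("unknown_sample.txt").read_text(encoding="utf-8")

minmax = compute_minmax(text_a, text_b, mfw=100)
delta = compute_johns_delta(text_a, text_b, method="quadratic")
comp = compute_compression_distance(text_a, text_b)

print(f"MinMax distance: {minmax.minmax_distance:.3f}")  # lower = more similar
print(f"Quadratic Delta: {delta.delta_score:.3f}")        # lower = more similar
print(f"NCD:             {comp.ncd:.3f}")                 # docstring: < ~0.6 leans same-author
```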
pystylometry/authorship/__init__.py
CHANGED
@@ -2,8 +2,8 @@
 
 This module provides methods for authorship attribution - comparing texts to
 determine whether they were written by the same author. Available methods
-include classic approaches (Burrows' Delta, Zeta)
-(Kilgarriff's chi-squared).
+include classic approaches (Burrows' Delta, Zeta), statistical methods
+(Kilgarriff's chi-squared), and information-theoretic methods (NCD).
 
 Related GitHub Issues:
     #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
     compute_cosine_delta: Angular distance variant of Delta
     compute_zeta: Zeta method for marker word detection
     compute_kilgarriff: Chi-squared method for corpus comparison
-    compute_minmax: Burrows' original min-max method
-    compute_johns_delta: Delta variations (
+    compute_minmax: Burrows' original min-max distance method
+    compute_johns_delta: Delta variations (quadratic, weighted)
+    compute_compression_distance: Normalized Compression Distance (NCD)
 """
 
 from .additional_methods import compute_johns_delta, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+from .compression import compute_compression_distance
 from .kilgarriff import compute_kilgarriff
 from .zeta import compute_zeta
 
 __all__ = [
     "compute_burrows_delta",
+    "compute_compression_distance",
     "compute_cosine_delta",
-    "
+    "compute_johns_delta",
     "compute_kilgarriff",
     "compute_minmax",
-    "
+    "compute_zeta",
 ]
pystylometry/authorship/additional_methods.py
CHANGED
@@ -8,40 +8,150 @@ Related GitHub Issue:
     https://github.com/craigtrim/pystylometry/issues/24
 
 Methods implemented:
-    - Kilgarriff's Chi-squared
-    - Min-Max (Burrows' original method)
-    - John Burrows' Delta variations
+    - Kilgarriff's Chi-squared -> See kilgarriff.py (Issue #31)
+    - Min-Max distance (Burrows' original method)
+    - John Burrows' Delta variations (Quadratic, Weighted)
 
 References:
     Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
     Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
     Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+    Argamon, S. (2008). Interpreting Burrows's Delta. Literary and Linguistic Computing.
 """
 
+from __future__ import annotations
+
+import math
+from collections import Counter
+
 from .._types import JohnsBurrowsResult, MinMaxResult
+from .._utils import tokenize
 
 
 def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
     """
-    Compute Min-Max distance
+    Compute Min-Max distance between two texts.
+
+    This is Burrows' original method from his 1992 paper, before the development
+    of Delta. It normalizes word frequencies using min-max scaling and computes
+    the mean absolute distance between normalized frequency vectors.
 
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24
 
+    Algorithm:
+        1. Tokenize both texts and build frequency distributions
+        2. Identify the top N most frequent words in the joint corpus
+        3. Compute relative frequencies for each word in each text
+        4. Normalize each word's frequencies using min-max scaling:
+           normalized(f) = (f - min) / (max - min)
+        5. Compute mean absolute difference of normalized frequencies
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Higher values indicate more different texts
+        - Scale: 0.0 (identical) to 1.0 (maximally different)
+
+    References:
+        Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+        nexus between analysis and information. Literary and Linguistic
+        Computing, 7(2), 91-109.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
+        mfw: Number of most frequent words to analyze (default: 100)
 
     Returns:
         MinMaxResult with min-max distance and distinctive features.
+
+    Example:
+        >>> result = compute_minmax(text_by_author_a, text_by_author_b)
+        >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+        >>> print(f"Most distinctive: {result.most_distinctive_features[0]}")
     """
-    #
-
-
-
-
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    if not tokens1 or not tokens2:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
+
+    if not top_words:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
+
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Min-Max normalization per feature across both texts
+    # Then compute absolute distance
+    contributions: list[tuple[str, float]] = []
+    total_distance = 0.0
+
+    for i, word in enumerate(top_words):
+        f1, f2 = rel1[i], rel2[i]
+        max_val = max(f1, f2)
+
+        if max_val > 0:
+            # Min-Max normalized distance for this feature
+            dist = abs(f1 - f2) / max_val
+        else:
+            dist = 0.0
+
+        total_distance += dist
+        contributions.append((word, dist))
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    # Mean distance across all features
+    minmax_distance = total_distance / len(top_words) if top_words else 0.0
+
+    return MinMaxResult(
+        minmax_distance=minmax_distance,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "method": "minmax_1992",
+            "all_contributions": contributions,
+        },
     )
 
 
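As implemented above, each feature contributes `abs(f1 - f2) / max(f1, f2)` on relative frequencies, and the reported `minmax_distance` is the mean of those contributions over the MFW list. A tiny worked check of that arithmetic (toy frequencies, not package output):

```python
# Two features with relative frequencies (text1, text2):
#   "the": 0.060 vs 0.050  ->  |0.060 - 0.050| / 0.060 = 0.1667
#   "of":  0.020 vs 0.030  ->  |0.020 - 0.030| / 0.030 = 0.3333
pairs = [("the", 0.060, 0.050), ("of", 0.020, 0.030)]
dists = [abs(f1 - f2) / max(f1, f2) for _, f1, f2 in pairs]
print(sum(dists) / len(dists))  # 0.25: mean of 0.1667 and 0.3333
```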
@@ -54,22 +164,157 @@ def compute_johns_delta(
     """
     Compute John Burrows' Delta variations.
 
+    This implements alternative formulations of Burrows' Delta metric beyond
+    the standard mean absolute z-score difference. The quadratic variant uses
+    squared z-score differences (Euclidean distance), while the weighted variant
+    applies inverse-rank weighting so higher-frequency words contribute more.
+
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
         https://github.com/craigtrim/pystylometry/issues/24
 
+    Methods:
+        - "quadratic": Euclidean distance of z-scores
+          Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
+
+        - "weighted": Inverse-rank weighted Delta
+          Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i)
+          where w_i = 1 / rank_i
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Quadratic Delta penalizes large deviations more than standard Delta
+        - Weighted Delta emphasizes the most frequent words
+
+    References:
+        Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+        parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+        Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+        probabilistic foundations. Literary and Linguistic Computing,
+        23(2), 131-147.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
-        method: Delta variation ("quadratic"
+        mfw: Number of most frequent words to analyze (default: 100)
+        method: Delta variation ("quadratic" or "weighted")
 
     Returns:
         JohnsBurrowsResult with delta score and method details.
+
+    Example:
+        >>> result = compute_johns_delta(text1, text2, method="quadratic")
+        >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
     """
-
-
-
-
-
+    if method not in ("quadratic", "weighted"):
+        raise ValueError(f"method must be 'quadratic' or 'weighted', got '{method}'")
+
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    if not tokens1 or not tokens2:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
+
+    if not top_words:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
+
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Mean-normalized differences
+    # With only 2 texts, classical z-scores are degenerate: stdev([a,b]) is
+    # always |a-b|/sqrt(2), producing identical z-scores (±0.707) for all
+    # features with any difference. Instead, we normalize by the mean frequency
+    # of each feature across both texts, which preserves discriminative power:
+    #     normalized_i = (f1_i - f2_i) / mean(f1_i, f2_i)
+    # This weights words proportionally to how much they differ relative to
+    # their expected frequency, preventing high-frequency words from dominating
+    # through absolute differences alone.
+    z1: list[float] = []
+    z2: list[float] = []
+    for i in range(len(top_words)):
+        mean_val = (rel1[i] + rel2[i]) / 2
+        # Normalize by mean frequency; use epsilon for words absent in both
+        norm = mean_val if mean_val > 0 else 1e-10
+        z1.append((rel1[i] - mean_val) / norm)
+        z2.append((rel2[i] - mean_val) / norm)
+
+    # Compute distance based on method
+    contributions: list[tuple[str, float]] = []
+
+    if method == "quadratic":
+        # Quadratic Delta: root mean squared z-score difference
+        squared_diffs: list[float] = []
+        for i, word in enumerate(top_words):
+            diff_sq = (z1[i] - z2[i]) ** 2
+            squared_diffs.append(diff_sq)
+            contributions.append((word, diff_sq))
+
+        delta_score = math.sqrt(sum(squared_diffs) / len(squared_diffs)) if squared_diffs else 0.0
+
+    else:  # weighted
+        # Weighted Delta: inverse-rank weighted mean absolute z-score difference
+        weighted_diffs: list[float] = []
+        weights: list[float] = []
+        for i, word in enumerate(top_words):
+            weight = 1.0 / (i + 1)  # Inverse rank weighting
+            abs_diff = abs(z1[i] - z2[i])
+            weighted_diffs.append(weight * abs_diff)
+            weights.append(weight)
+            contributions.append((word, abs_diff))
+
+        delta_score = sum(weighted_diffs) / sum(weights) if weights else 0.0
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    return JohnsBurrowsResult(
+        delta_score=delta_score,
+        method=method,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "z_scores_text1": dict(zip(top_words[:20], z1[:20])),
+            "z_scores_text2": dict(zip(top_words[:20], z2[:20])),
+            "all_contributions": contributions,
+        },
     )
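Both variants are simple aggregations over the per-word normalized differences computed above. A short numeric check of the two formulas (toy values, not package output):

```python
import math

# Toy normalized differences |z1_i - z2_i| for the 3 most frequent words
diffs = [0.8, 0.3, 0.1]

# Quadratic Delta: root mean squared difference
quadratic = math.sqrt(sum(d * d for d in diffs) / len(diffs))

# Weighted Delta: inverse-rank weighted mean absolute difference
weights = [1.0 / (i + 1) for i in range(len(diffs))]
weighted = sum(w * d for w, d in zip(weights, diffs)) / sum(weights)

print(f"quadratic={quadratic:.3f} weighted={weighted:.3f}")
# quadratic = sqrt((0.64 + 0.09 + 0.01) / 3) ≈ 0.497
# weighted  = (1*0.8 + 0.5*0.3 + 0.3333*0.1) / 1.8333 ≈ 0.536
```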