pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff shows the changes between package versions as they were released to a supported public registry. It is provided for informational purposes only.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/_utils.py
CHANGED
@@ -9,9 +9,13 @@ from .tokenizer import Tokenizer
 # ===== Convenience Functions =====
 
 # Default tokenizer instance for backward compatibility
+# Preserves emails and URLs to allow readability metrics (like Coleman-Liau)
+# to count their alphabetic characters
 _default_tokenizer = Tokenizer(
     lowercase=False,
     strip_punctuation=False,
+    preserve_urls=True,
+    preserve_emails=True,
 )
 
 
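The new comments explain the motivation: Coleman-Liau and similar character-based formulas need URLs and emails kept as single tokens so their alphabetic characters count against one word rather than several fragments. A minimal standalone sketch of the idea (this is not the library's Tokenizer; the regexes are illustrative only):

import re

URL_OR_EMAIL = re.compile(r"\S+@\S+|https?://\S+")
WORD = re.compile(r"[A-Za-z]+")

def naive_tokens(text: str) -> list[str]:
    # Splitting on letters alone shatters a URL into short fragments,
    # inflating the word count and deflating letters-per-word.
    return WORD.findall(text)

def preserving_tokens(text: str) -> list[str]:
    # Pull URLs/emails out first, then tokenize the remainder.
    kept = URL_OR_EMAIL.findall(text)
    rest = URL_OR_EMAIL.sub(" ", text)
    return kept + WORD.findall(rest)

text = "See https://example.com/docs or mail admin@example.com today."
print(naive_tokens(text))       # ['See', 'https', 'example', 'com', 'docs', ...]
print(preserving_tokens(text))  # ['https://example.com/docs', 'admin@example.com', 'See', ...]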
pystylometry/authorship/__init__.py
CHANGED
@@ -1,10 +1,35 @@
-"""Authorship attribution metrics.
+"""Authorship attribution metrics.
 
+This module provides methods for authorship attribution - comparing texts to
+determine whether they were written by the same author. Available methods
+include classic approaches (Burrows' Delta, Zeta) and statistical methods
+(Kilgarriff's chi-squared).
+
+Related GitHub Issues:
+    #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+    #31 - Classical Stylometric Methods from Programming Historian
+        https://github.com/craigtrim/pystylometry/issues/31
+
+Available Functions:
+    compute_burrows_delta: Classic Delta method for authorship distance
+    compute_cosine_delta: Angular distance variant of Delta
+    compute_zeta: Zeta method for marker word detection
+    compute_kilgarriff: Chi-squared method for corpus comparison
+    compute_minmax: Burrows' original min-max method (not yet implemented)
+    compute_johns_delta: Delta variations (not yet implemented)
+"""
+
+from .additional_methods import compute_johns_delta, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+from .kilgarriff import compute_kilgarriff
 from .zeta import compute_zeta
 
 __all__ = [
     "compute_burrows_delta",
     "compute_cosine_delta",
     "compute_zeta",
+    "compute_kilgarriff",
+    "compute_minmax",
+    "compute_johns_delta",
 ]
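With the re-exports above, the new method is reachable directly from the authorship subpackage. A usage sketch (the import path follows the module layout in this diff; the file names are placeholders):

from pystylometry.authorship import compute_kilgarriff

text_a = open("paper_a.txt").read()   # placeholder inputs
text_b = open("paper_b.txt").read()

result = compute_kilgarriff(text_a, text_b, n_words=200)  # smaller vocabulary for shorter texts
print(result.chi_squared, result.degrees_of_freedom)
print(result.most_distinctive_features[:5])  # words driving the difference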
pystylometry/authorship/additional_methods.py
ADDED
@@ -0,0 +1,75 @@
+"""Additional authorship attribution methods.
+
+This module provides alternative distance/similarity metrics for authorship
+attribution beyond Burrows' Delta and Zeta.
+
+Related GitHub Issue:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+
+Methods implemented:
+    - Kilgarriff's Chi-squared → See kilgarriff.py (Issue #31)
+    - Min-Max (Burrows' original method) → Not yet implemented
+    - John Burrows' Delta variations → Not yet implemented
+
+References:
+    Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
+    Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
+    Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+"""
+
+from .._types import JohnsBurrowsResult, MinMaxResult
+
+
+def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
+    """
+    Compute Min-Max distance (Burrows' original method).
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+
+    Returns:
+        MinMaxResult with min-max distance and distinctive features.
+    """
+    # TODO: Implement Min-Max distance
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "Min-Max distance not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
+
+
+def compute_johns_delta(
+    text1: str,
+    text2: str,
+    mfw: int = 100,
+    method: str = "quadratic",
+) -> JohnsBurrowsResult:
+    """
+    Compute John Burrows' Delta variations.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+        method: Delta variation ("quadratic", "weighted", "rotated")
+
+    Returns:
+        JohnsBurrowsResult with delta score and method details.
+    """
+    # TODO: Implement John's Delta variations
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "John's Delta variations not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
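Both functions are stubs that raise immediately, so pipelines that batch several attribution methods should guard for NotImplementedError until Issue #24 lands. A minimal sketch, assuming only the signatures declared above:

from pystylometry.authorship import compute_johns_delta, compute_minmax

for method in (compute_minmax, compute_johns_delta):
    try:
        method("sample text one", "sample text two")
    except NotImplementedError as exc:
        # Expected until Issue #24 is implemented
        print(f"{method.__name__}: {exc}")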
pystylometry/authorship/kilgarriff.py
ADDED
@@ -0,0 +1,347 @@
+"""Kilgarriff's Chi-Squared method for authorship attribution.
+
+This module implements Adam Kilgarriff's chi-squared method for measuring
+statistical distance between two text corpora based on word frequency
+distributions. The method is particularly effective for authorship attribution
+when comparing frequency profiles of common words.
+
+Related GitHub Issues:
+    #31 - Classical Stylometric Methods from Programming Historian
+        https://github.com/craigtrim/pystylometry/issues/31
+    #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+    #36 - Kilgarriff Chi-Squared drift detection (uses _kilgarriff_core)
+        https://github.com/craigtrim/pystylometry/issues/36
+
+Algorithm Overview:
+    Kilgarriff's method compares two texts by:
+    1. Combining both texts into a joint corpus
+    2. Extracting the top N most frequent words from the joint corpus
+    3. For each word, computing expected vs. observed frequencies
+    4. Applying the chi-squared formula: χ² = Σ((O - E)² / E)
+
+    Lower chi-squared values indicate more similar texts (likely same author).
+    The method identifies which words contribute most to the difference,
+    providing interpretable results.
+
+Theoretical Background:
+    The chi-squared test measures the discrepancy between observed and expected
+    frequencies. In Kilgarriff's formulation, the expected frequency for a word
+    is calculated assuming both texts come from the same underlying distribution
+    (the joint corpus). Large deviations from this expectation contribute to
+    higher chi-squared scores.
+
+    Formula for expected frequency of word w in text T:
+        E(w, T) = (count(w) in joint corpus) × (size(T) / size(joint corpus))
+
+    Chi-squared contribution for word w:
+        χ²(w) = ((O(w, T1) - E(w, T1))² / E(w, T1)) + ((O(w, T2) - E(w, T2))² / E(w, T2))
+
+References:
+    Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+    Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+    doi: 10.1075/ijcl.6.1.05kil
+
+    Oakes, Michael P. "Statistics for Corpus Linguistics." Edinburgh University
+    Press, 1998.
+
+    Programming Historian. "Introduction to Stylometry with Python."
+    https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python
+"""
+
+from __future__ import annotations
+
+import math
+from collections import Counter
+from typing import Any
+
+from .._types import KilgarriffResult
+from .._utils import tokenize
+
+
+def _chi2_cdf(x: float, df: int) -> float:
+    """
+    Compute chi-squared CDF in pure Python (no scipy required).
+
+    P(X ≤ x) for X ~ χ²(df), computed via the regularized lower
+    incomplete gamma function: P(df/2, x/2).
+
+    Uses series expansion for x < a+1, continued fraction otherwise.
+    """
+    if x <= 0 or df <= 0:
+        return 0.0 if x <= 0 else 1.0
+
+    a = df / 2.0
+    z = x / 2.0
+
+    if z < a + 1:
+        # Series expansion for P(a, z)
+        ap = a
+        total = 1.0 / a
+        delta = total
+
+        for _ in range(1000):
+            ap += 1
+            delta *= z / ap
+            total += delta
+            if abs(delta) < abs(total) * 1e-15:
+                break
+
+        try:
+            log_prefix = a * math.log(z) - z - math.lgamma(a)
+            return min(max(total * math.exp(log_prefix), 0.0), 1.0)
+        except (OverflowError, ValueError):
+            return 0.0
+    else:
+        # Continued fraction for Q(a, z) = 1 - P(a, z)
+        try:
+            log_prefix = a * math.log(z) - z - math.lgamma(a)
+        except (OverflowError, ValueError):
+            return 1.0
+
+        b = z + 1 - a
+        c = 1.0 / 1e-30
+        d = 1.0 / b
+        h = d
+
+        for i in range(1, 1000):
+            an = -i * (i - a)
+            b += 2.0
+            d = an * d + b
+            if abs(d) < 1e-30:
+                d = 1e-30
+            c = b + an / c
+            if abs(c) < 1e-30:
+                c = 1e-30
+            d = 1.0 / d
+            delta = d * c
+            h *= delta
+            if abs(delta - 1.0) < 1e-15:
+                break
+
+        q = math.exp(log_prefix) * h
+        return min(max(1.0 - q, 0.0), 1.0)
+
+
+def _kilgarriff_core(
+    tokens1: list[str],
+    tokens2: list[str],
+    n_words: int = 500,
+) -> tuple[float, int, list[tuple[str, float]], dict[str, Any]]:
+    """
+    Core chi-squared computation between two tokenized texts.
+
+    This internal function performs the actual chi-squared calculation and is
+    shared by both compute_kilgarriff() (two-text comparison) and the
+    consistency module's compute_kilgarriff_drift() (intra-document analysis).
+
+    Related GitHub Issues:
+        #31 - Classical Stylometric Methods from Programming Historian
+            https://github.com/craigtrim/pystylometry/issues/31
+        #36 - Shared by consistency/drift.py for sliding window analysis
+            https://github.com/craigtrim/pystylometry/issues/36
+
+    Algorithm:
+        1. Count word frequencies in each token list
+        2. Build joint vocabulary from top N words in combined corpus
+        3. For each word in joint vocabulary:
+           a. Compute observed count in each text
+           b. Compute expected count based on joint corpus proportions
+           c. Calculate chi-squared contribution: (O - E)² / E
+        4. Sum contributions for total chi-squared statistic
+
+    Args:
+        tokens1: List of tokens from first text (already lowercased)
+        tokens2: List of tokens from second text (already lowercased)
+        n_words: Number of most frequent words to use (default: 500)
+
+    Returns:
+        Tuple of:
+        - chi_squared: Total chi-squared statistic
+        - df: Degrees of freedom (n_words - 1)
+        - top_contributors: List of (word, contribution) pairs sorted by contribution
+        - details: Dict with frequency tables and intermediate values
+
+    Note:
+        P-value computation is omitted because the chi-squared test assumptions
+        are often violated in stylometric analysis (words are not independent).
+        The raw chi-squared value is more useful for relative comparisons.
+
+    Example:
+        >>> tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
+        >>> tokens2 = ["the", "dog", "ran", "to", "the", "park"]
+        >>> chi_sq, df, top, details = _kilgarriff_core(tokens1, tokens2, n_words=10)
+        >>> print(f"Chi-squared: {chi_sq:.2f}")
+    """
+    # Handle edge cases
+    if not tokens1 or not tokens2:
+        return 0.0, 0, [], {"warning": "One or both token lists are empty"}
+
+    # Count word frequencies
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+
+    # Build joint corpus vocabulary (top N words)
+    # Kilgarriff (2001) recommends using the joint corpus to avoid bias
+    joint_freq: Counter[str] = Counter()
+    joint_freq.update(freq1)
+    joint_freq.update(freq2)
+    top_words = [word for word, _ in joint_freq.most_common(n_words)]
+
+    # Calculate corpus sizes
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+    total_size = size1 + size2
+
+    # Proportions for expected frequency calculation
+    prop1 = size1 / total_size
+    prop2 = size2 / total_size
+
+    # Calculate chi-squared contributions for each word
+    chi_squared = 0.0
+    contributions: list[tuple[str, float]] = []
+
+    for word in top_words:
+        # Observed counts
+        obs1 = freq1.get(word, 0)
+        obs2 = freq2.get(word, 0)
+        joint_count = obs1 + obs2
+
+        # Expected counts (under null hypothesis of same distribution)
+        # Expected = joint_count × proportion_of_corpus
+        exp1 = joint_count * prop1
+        exp2 = joint_count * prop2
+
+        # Chi-squared contribution for this word
+        # Only compute if expected > 0 to avoid division by zero
+        contrib = 0.0
+        if exp1 > 0:
+            contrib += ((obs1 - exp1) ** 2) / exp1
+        if exp2 > 0:
+            contrib += ((obs2 - exp2) ** 2) / exp2
+
+        chi_squared += contrib
+        contributions.append((word, contrib))
+
+    # Sort contributions by magnitude (descending)
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    # Degrees of freedom: n_words - 1 (standard for chi-squared goodness of fit)
+    df = len(top_words) - 1 if len(top_words) > 1 else 0
+
+    # Detailed information for debugging and analysis
+    details = {
+        "text1_size": size1,
+        "text2_size": size2,
+        "joint_corpus_size": total_size,
+        "text1_vocab": len(freq1),
+        "text2_vocab": len(freq2),
+        "joint_vocab": len(joint_freq),
+        "features_used": len(top_words),
+        "text1_proportion": prop1,
+        "text2_proportion": prop2,
+    }
+
+    return chi_squared, df, contributions, details
+
+
+def compute_kilgarriff(
+    text1: str,
+    text2: str,
+    n_words: int = 500,
+) -> KilgarriffResult:
+    """
+    Compute Kilgarriff's chi-squared distance between two texts.
+
+    This function measures the statistical distance between two texts based on
+    their word frequency distributions. Lower values indicate more similar texts
+    (likely same author or style). The method is particularly effective for
+    authorship attribution.
+
+    Related GitHub Issues:
+        #31 - Classical Stylometric Methods from Programming Historian
+            https://github.com/craigtrim/pystylometry/issues/31
+        #24 - Additional Authorship Attribution Methods
+            https://github.com/craigtrim/pystylometry/issues/24
+
+    Algorithm Overview:
+        1. Tokenize both texts and convert to lowercase
+        2. Extract top N most frequent words from joint corpus
+        3. Compute chi-squared statistic comparing frequency distributions
+        4. Identify most discriminating words
+
+    Interpretation:
+        - Lower χ² = More similar texts (likely same author)
+        - Higher χ² = More different texts (likely different authors)
+        - Top discriminating words reveal what makes texts different
+
+    Recommended Usage:
+        - Use n_words=500 for general authorship (Kilgarriff's recommendation)
+        - Use n_words=100-200 for shorter texts (< 5000 words each)
+        - Use n_words=1000+ for very long texts or fine-grained analysis
+
+    References:
+        Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+        Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+
+        Programming Historian. "Introduction to Stylometry with Python."
+        https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        n_words: Number of most frequent words to analyze (default: 500).
+            Higher values provide finer discrimination but require longer texts.
+
+    Returns:
+        KilgarriffResult containing:
+        - chi_squared: Chi-squared statistic (lower = more similar)
+        - p_value: Statistical significance (often unreliable; use chi_squared for comparison)
+        - degrees_of_freedom: n_words - 1
+        - feature_count: Number of words used
+        - most_distinctive_features: Words that contribute most to difference
+        - metadata: Detailed frequency information
+
+    Example:
+        >>> # Compare two texts
+        >>> result = compute_kilgarriff(text_by_author_a, text_by_author_b)
+        >>> print(f"Chi-squared distance: {result.chi_squared:.2f}")
+        >>> print(f"Most distinctive word: {result.most_distinctive_features[0][0]}")
+
+        >>> # Lower chi-squared suggests same author
+        >>> if result.chi_squared < threshold:
+        ...     print("Texts are stylistically similar")
+
+    Note:
+        The p_value is included for completeness but should be interpreted
+        cautiously. Chi-squared test assumptions (independence) are typically
+        violated in text analysis. The raw chi_squared value is more reliable
+        for relative comparisons between text pairs.
+    """
+    # Tokenize and lowercase
+    # Using lowercase ensures "The" and "the" are counted together
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    # Compute chi-squared using core function
+    chi_squared, df, contributions, details = _kilgarriff_core(tokens1, tokens2, n_words=n_words)
+
+    # P-value computation (pure Python, no scipy required)
+    # Note: This is provided for completeness but should be used cautiously.
+    # The chi-squared test assumes independence, which is violated in text.
+    # For authorship attribution, relative chi-squared comparisons are more reliable.
+    p_value = 1.0 - _chi2_cdf(chi_squared, df) if df > 0 else 1.0
+
+    return KilgarriffResult(
+        chi_squared=chi_squared,
+        p_value=p_value,
+        degrees_of_freedom=df,
+        feature_count=len(contributions),
+        most_distinctive_features=contributions[:20],  # Top 20 contributors
+        metadata={
+            **details,
+            "all_contributions": contributions,  # Full list for detailed analysis
+            "method": "kilgarriff_2001",
+            "n_words_requested": n_words,
+        },
+    )
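The expected-frequency and contribution formulas in the module docstring can be checked by hand on the toy pair from the _kilgarriff_core doctest. A standalone sketch that mirrors the core loop above (variable names here are illustrative, not part of the package):

from collections import Counter

tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
tokens2 = ["the", "dog", "ran", "to", "the", "park"]

freq1, freq2 = Counter(tokens1), Counter(tokens2)
joint = freq1 + freq2
prop1 = len(tokens1) / (len(tokens1) + len(tokens2))  # 0.5: equal-sized texts

chi_squared = 0.0
for word, joint_count in joint.most_common(10):
    exp1 = joint_count * prop1        # E(w, T1) = joint count × size share
    exp2 = joint_count * (1 - prop1)
    for obs, exp in ((freq1[word], exp1), (freq2[word], exp2)):
        if exp > 0:
            chi_squared += (obs - exp) ** 2 / exp

# "the" occurs twice in each text (observed == expected, contribution 0);
# each of the other 8 words is 1 vs 0 against an expectation of 0.5,
# contributing 0.5 + 0.5 = 1.0 apiece.
print(chi_squared)  # 8.0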
pystylometry/character/__init__.py
ADDED
@@ -0,0 +1,15 @@
+"""Character-level metrics for stylometric analysis.
+
+This package provides character-level features for authorship attribution
+and style analysis.
+
+Related GitHub Issue:
+    #12 - Character-Level Metrics
+    https://github.com/craigtrim/pystylometry/issues/12
+"""
+
+from .character_metrics import compute_character_metrics
+
+__all__ = [
+    "compute_character_metrics",
+]