pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/authorship/kilgarriff.py
@@ -0,0 +1,354 @@
+ """Kilgarriff's Chi-Squared method for authorship attribution.
+
+ This module implements Adam Kilgarriff's chi-squared method for measuring
+ statistical distance between two text corpora based on word frequency
+ distributions. The method is particularly effective for authorship attribution
+ when comparing frequency profiles of common words.
+
+ Related GitHub Issues:
+     #31 - Classical Stylometric Methods from Programming Historian
+         https://github.com/craigtrim/pystylometry/issues/31
+     #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+     #36 - Kilgarriff Chi-Squared drift detection (uses _kilgarriff_core)
+         https://github.com/craigtrim/pystylometry/issues/36
+
+ Algorithm Overview:
+     Kilgarriff's method compares two texts by:
+     1. Combining both texts into a joint corpus
+     2. Extracting the top N most frequent words from the joint corpus
+     3. For each word, computing expected vs. observed frequencies
+     4. Applying the chi-squared formula: χ² = Σ((O - E)² / E)
+
+     Lower chi-squared values indicate more similar texts (likely same author).
+     The method identifies which words contribute most to the difference,
+     providing interpretable results.
+
+ Theoretical Background:
+     The chi-squared test measures the discrepancy between observed and expected
+     frequencies. In Kilgarriff's formulation, the expected frequency for a word
+     is calculated assuming both texts come from the same underlying distribution
+     (the joint corpus). Large deviations from this expectation contribute to
+     higher chi-squared scores.
+
+     Formula for expected frequency of word w in text T:
+         E(w, T) = (count(w) in joint corpus) × (size(T) / size(joint corpus))
+
+     Chi-squared contribution for word w:
+         χ²(w) = ((O(w, T1) - E(w, T1))² / E(w, T1)) + ((O(w, T2) - E(w, T2))² / E(w, T2))
+
+ References:
+     Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+     Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+     doi: 10.1075/ijcl.6.1.05kil
+
+     Oakes, Michael P. "Statistics for Corpus Linguistics." Edinburgh University
+     Press, 1998.
+
+     Programming Historian. "Introduction to Stylometry with Python."
+     https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python
+ """
+
+ from __future__ import annotations
+
+ import math
+ from collections import Counter
+ from typing import Any
+
+ from .._types import KilgarriffResult
+ from .._utils import tokenize
+
+
+ def _chi2_cdf(x: float, df: int) -> float:
+     """
+     Compute chi-squared CDF in pure Python (no scipy required).
+
+     P(X ≤ x) for X ~ χ²(df), computed via the regularized lower
+     incomplete gamma function: P(df/2, x/2).
+
+     Uses a series expansion for P(a, z) when z < a + 1, a continued fraction otherwise.
+     """
+     if x <= 0 or df <= 0:
+         return 0.0 if x <= 0 else 1.0
+
+     a = df / 2.0
+     z = x / 2.0
+
+     if z < a + 1:
+         # Series expansion for P(a, z)
+         ap = a
+         total = 1.0 / a
+         delta = total
+
+         for _ in range(1000):
+             ap += 1
+             delta *= z / ap
+             total += delta
+             if abs(delta) < abs(total) * 1e-15:
+                 break
+
+         try:
+             log_prefix = a * math.log(z) - z - math.lgamma(a)
+             return min(max(total * math.exp(log_prefix), 0.0), 1.0)
+         except (OverflowError, ValueError):
+             return 0.0
+     else:
+         # Continued fraction for Q(a, z) = 1 - P(a, z)
+         try:
+             log_prefix = a * math.log(z) - z - math.lgamma(a)
+         except (OverflowError, ValueError):
+             return 1.0
+
+         b = z + 1 - a
+         c = 1.0 / 1e-30
+         d = 1.0 / b
+         h = d
+
+         for i in range(1, 1000):
+             an = -i * (i - a)
+             b += 2.0
+             d = an * d + b
+             if abs(d) < 1e-30:
+                 d = 1e-30
+             c = b + an / c
+             if abs(c) < 1e-30:
+                 c = 1e-30
+             d = 1.0 / d
+             delta = d * c
+             h *= delta
+             if abs(delta - 1.0) < 1e-15:
+                 break
+
+         q = math.exp(log_prefix) * h
+         return min(max(1.0 - q, 0.0), 1.0)
+
+
+ def _kilgarriff_core(
+     tokens1: list[str],
+     tokens2: list[str],
+     n_words: int = 500,
+ ) -> tuple[float, int, list[tuple[str, float]], dict[str, Any]]:
+     """
+     Core chi-squared computation between two tokenized texts.
+
+     This internal function performs the actual chi-squared calculation and is
+     shared by both compute_kilgarriff() (two-text comparison) and the
+     consistency module's compute_kilgarriff_drift() (intra-document analysis).
+
+     Related GitHub Issues:
+         #31 - Classical Stylometric Methods from Programming Historian
+             https://github.com/craigtrim/pystylometry/issues/31
+         #36 - Shared by consistency/drift.py for sliding window analysis
+             https://github.com/craigtrim/pystylometry/issues/36
+
+     Algorithm:
+         1. Count word frequencies in each token list
+         2. Build joint vocabulary from top N words in combined corpus
+         3. For each word in joint vocabulary:
+            a. Compute observed count in each text
+            b. Compute expected count based on joint corpus proportions
+            c. Calculate chi-squared contribution: (O - E)² / E
+         4. Sum contributions for total chi-squared statistic
+
+     Args:
+         tokens1: List of tokens from first text (already lowercased)
+         tokens2: List of tokens from second text (already lowercased)
+         n_words: Number of most frequent words to use (default: 500)
+
+     Returns:
+         Tuple of:
+         - chi_squared: Total chi-squared statistic
+         - df: Degrees of freedom (number of features used, minus 1)
+         - top_contributors: List of (word, contribution) pairs sorted by contribution
+         - details: Dict with frequency tables and intermediate values
+
+     Note:
+         P-value computation is omitted because the chi-squared test assumptions
+         are often violated in stylometric analysis (words are not independent).
+         The raw chi-squared value is more useful for relative comparisons.
+
+     Example:
+         >>> tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
+         >>> tokens2 = ["the", "dog", "ran", "to", "the", "park"]
+         >>> chi_sq, df, top, details = _kilgarriff_core(tokens1, tokens2, n_words=10)
+         >>> print(f"Chi-squared: {chi_sq:.2f}")
+     """
+     # Handle edge cases
+     if not tokens1 or not tokens2:
+         return 0.0, 0, [], {"warning": "One or both token lists are empty"}
+
+     # Count word frequencies
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+
+     # Build joint corpus vocabulary (top N words)
+     # Kilgarriff (2001) recommends using the joint corpus to avoid bias
+     joint_freq: Counter[str] = Counter()
+     joint_freq.update(freq1)
+     joint_freq.update(freq2)
+     top_words = [word for word, _ in joint_freq.most_common(n_words)]
+
+     # Calculate corpus sizes
+     size1 = len(tokens1)
+     size2 = len(tokens2)
+     total_size = size1 + size2
+
+     # Proportions for expected frequency calculation
+     prop1 = size1 / total_size
+     prop2 = size2 / total_size
+
+     # Calculate chi-squared contributions for each word
+     chi_squared = 0.0
+     contributions: list[tuple[str, float]] = []
+
+     for word in top_words:
+         # Observed counts
+         obs1 = freq1.get(word, 0)
+         obs2 = freq2.get(word, 0)
+         joint_count = obs1 + obs2
+
+         # Expected counts (under null hypothesis of same distribution)
+         # Expected = joint_count × proportion_of_corpus
+         exp1 = joint_count * prop1
+         exp2 = joint_count * prop2
+
+         # Chi-squared contribution for this word
+         # Only compute if expected > 0 to avoid division by zero
+         contrib = 0.0
+         if exp1 > 0:
+             contrib += ((obs1 - exp1) ** 2) / exp1
+         if exp2 > 0:
+             contrib += ((obs2 - exp2) ** 2) / exp2
+
+         chi_squared += contrib
+         contributions.append((word, contrib))
+
+     # Sort contributions by magnitude (descending)
+     contributions.sort(key=lambda x: x[1], reverse=True)
+
+     # Degrees of freedom: features used - 1 (standard for chi-squared goodness of fit)
+     df = len(top_words) - 1 if len(top_words) > 1 else 0
+
+     # Detailed information for debugging and analysis
+     details = {
+         "text1_size": size1,
+         "text2_size": size2,
+         "joint_corpus_size": total_size,
+         "text1_vocab": len(freq1),
+         "text2_vocab": len(freq2),
+         "joint_vocab": len(joint_freq),
+         "features_used": len(top_words),
+         "text1_proportion": prop1,
+         "text2_proportion": prop2,
+     }
+
+     return chi_squared, df, contributions, details
+
+
+ def compute_kilgarriff(
+     text1: str,
+     text2: str,
+     n_words: int = 500,
+     top_features: int = 20,
+ ) -> KilgarriffResult:
+     """
+     Compute Kilgarriff's chi-squared distance between two texts.
+
+     This function measures the statistical distance between two texts based on
+     their word frequency distributions. Lower values indicate more similar texts
+     (likely same author or style). The method is particularly effective for
+     authorship attribution.
+
+     Related GitHub Issues:
+         #31 - Classical Stylometric Methods from Programming Historian
+             https://github.com/craigtrim/pystylometry/issues/31
+         #24 - Additional Authorship Attribution Methods
+             https://github.com/craigtrim/pystylometry/issues/24
+
+     Algorithm Overview:
+         1. Tokenize both texts and convert to lowercase
+         2. Extract top N most frequent words from joint corpus
+         3. Compute chi-squared statistic comparing frequency distributions
+         4. Identify most discriminating words
+
+     Interpretation:
+         - Lower χ² = More similar texts (likely same author)
+         - Higher χ² = More different texts (likely different authors)
+         - Top discriminating words reveal what makes texts different
+
+     Recommended Usage:
+         - Use n_words=500 for general authorship (Kilgarriff's recommendation)
+         - Use n_words=100-200 for shorter texts (< 5000 words each)
+         - Use n_words=1000+ for very long texts or fine-grained analysis
+
+     References:
+         Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+         Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+
+         Programming Historian. "Introduction to Stylometry with Python."
+         https://programminghistorian.org/en/lessons/introduction-to-stylometry-with-python
+
+     Args:
+         text1: First text for comparison
+         text2: Second text for comparison
+         n_words: Number of most frequent words to analyze (default: 500).
+             Higher values provide finer discrimination but require longer texts.
+         top_features: Number of most distinctive features to return (default: 20).
+             Controls the length of most_distinctive_features in the result.
+
+     Returns:
+         KilgarriffResult containing:
+         - chi_squared: Chi-squared statistic (lower = more similar)
+         - p_value: Statistical significance (often unreliable; use chi_squared for comparison)
+         - degrees_of_freedom: number of features used, minus 1
+         - feature_count: Number of words used
+         - most_distinctive_features: Words that contribute most to difference
+         - metadata: Detailed frequency information
+
+     Example:
+         >>> # Compare two texts
+         >>> result = compute_kilgarriff(text_by_author_a, text_by_author_b)
+         >>> print(f"Chi-squared distance: {result.chi_squared:.2f}")
+         >>> print(f"Most distinctive word: {result.most_distinctive_features[0][0]}")
+
+         >>> # Lower chi-squared suggests same author
+         >>> if result.chi_squared < threshold:
+         ...     print("Texts are stylistically similar")
+
+     Note:
+         The p_value is included for completeness but should be interpreted
+         cautiously. Chi-squared test assumptions (independence) are typically
+         violated in text analysis. The raw chi_squared value is more reliable
+         for relative comparisons between text pairs.
+     """
+     # Validate top_features
+     if top_features < 1:
+         raise ValueError("top_features must be >= 1")
+
+     # Tokenize and lowercase
+     # Using lowercase ensures "The" and "the" are counted together
+     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+     tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+     # Compute chi-squared using core function
+     chi_squared, df, contributions, details = _kilgarriff_core(tokens1, tokens2, n_words=n_words)
+
+     # P-value computation (pure Python, no scipy required)
+     # Note: This is provided for completeness but should be used cautiously.
+     # The chi-squared test assumes independence, which is violated in text.
+     # For authorship attribution, relative chi-squared comparisons are more reliable.
+     p_value = 1.0 - _chi2_cdf(chi_squared, df) if df > 0 else 1.0
+
+     return KilgarriffResult(
+         chi_squared=chi_squared,
+         p_value=p_value,
+         degrees_of_freedom=df,
+         feature_count=len(contributions),
+         most_distinctive_features=contributions[:top_features],
+         metadata={
+             **details,
+             "all_contributions": contributions,  # Full list for detailed analysis
+             "method": "kilgarriff_2001",
+             "n_words_requested": n_words,
+         },
+     )
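The arithmetic in `_kilgarriff_core` is small enough to verify by hand. The sketch below re-derives the chi-squared statistic for the toy token lists from the `_kilgarriff_core` docstring, applying only the expected-frequency formula quoted in the module docstring; it deliberately avoids importing pystylometry, so every name in it is illustrative rather than part of the package API.

```python
from collections import Counter

# Toy inputs copied from the _kilgarriff_core docstring example.
tokens1 = ["the", "cat", "sat", "on", "the", "mat"]
tokens2 = ["the", "dog", "ran", "to", "the", "park"]

freq1, freq2 = Counter(tokens1), Counter(tokens2)
joint = freq1 + freq2  # joint-corpus frequencies
prop1 = len(tokens1) / (len(tokens1) + len(tokens2))  # 0.5: both texts have 6 tokens
prop2 = 1.0 - prop1

chi_squared = 0.0
for word, joint_count in joint.most_common(10):  # n_words=10, as in the docstring
    # E(w, T) = count(w in joint corpus) * (size(T) / size(joint corpus))
    exp1, exp2 = joint_count * prop1, joint_count * prop2
    obs1, obs2 = freq1.get(word, 0), freq2.get(word, 0)
    chi_squared += (obs1 - exp1) ** 2 / exp1 + (obs2 - exp2) ** 2 / exp2

print(f"chi-squared = {chi_squared:.2f}")  # 8.00
```

The shared word "the" matches its expected count in both texts and contributes nothing; each of the eight non-shared words contributes exactly 1.0, which is the kind of per-word breakdown `compute_kilgarriff` surfaces in `most_distinctive_features`. As a separate sanity check on `_chi2_cdf`, the 0.95 quantile of the chi-squared distribution with one degree of freedom is about 3.841, so `_chi2_cdf(3.841, 1)` should come out near 0.95.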
pystylometry/character/README.md
@@ -0,0 +1,17 @@
+ # character
+
+ ![1 public function](https://img.shields.io/badge/functions-1-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Character-level features for stylometric fingerprinting.
+
+ ## Catalogue
+
+ | File | Function | What It Measures |
+ |------|----------|-----------------|
+ | `character_metrics.py` | `compute_character_metrics` | Letter frequencies, digit ratios, uppercase ratios, special character usage, whitespace patterns |
+
+ ## See Also
+
+ - [`ngrams/`](../ngrams/) for character-level bigram entropy
+ - [`stylistic/markers.py`](../stylistic/) for punctuation style analysis
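The catalogue above names a single public function, so a usage sketch may help. Note that this diff confirms only the module path and the function name; the single-string argument and printable result below are assumptions, not documented API.

```python
# Hypothetical usage sketch -- the argument and result shape are assumptions;
# the diff shows only the function's name and its module.
from pystylometry.character.character_metrics import compute_character_metrics

sample = "Ms. O'Neill paid $42.50 at 9:05 AM!!"
result = compute_character_metrics(sample)  # letter/digit/uppercase/whitespace stats
print(result)
```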