pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/authorship/README.md
@@ -0,0 +1,21 @@
+ # authorship
+
+ ![7 public functions](https://img.shields.io/badge/functions-7-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Authorship attribution methods for comparing texts and determining likely authorship.
+
+ ## Catalogue
+
+ | File | Functions | Method |
+ |------|-----------|--------|
+ | `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+ | `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+ | `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+ | `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+ | `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+ ## See Also
+
+ - [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+ - [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
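For orientation, here is a minimal usage sketch of the catalogue above. It relies only on the three functions whose full signatures appear later in this diff (`compute_minmax`, `compute_johns_delta`, `compute_compression_distance`); the sample strings are placeholders, not part of the package, and real comparisons would use much longer documents.

```python
from pystylometry.authorship import (
    compute_compression_distance,
    compute_johns_delta,
    compute_minmax,
)

# Placeholder inputs -- substitute real documents of substantial length.
text_a = "It was a bright cold day in April, and the clocks were striking thirteen."
text_b = "The clocks were striking thirteen as the cold April morning began."

# For all three methods, lower scores suggest the samples are stylistically closer.
minmax = compute_minmax(text_a, text_b, mfw=100)
delta = compute_johns_delta(text_a, text_b, mfw=100, method="weighted")
ncd = compute_compression_distance(text_a, text_b, compressor="gzip")

print(f"MinMax distance: {minmax.minmax_distance:.3f}")
print(f"Weighted Delta:  {delta.delta_score:.3f}")
print(f"Compression NCD: {ncd.ncd:.3f}")
```

The `mfw=100` and `compressor="gzip"` arguments match the defaults documented in the function signatures shown below.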
pystylometry/authorship/__init__.py
@@ -1,14 +1,38 @@
- """Authorship attribution metrics."""
+ """Authorship attribution metrics.

- from .additional_methods import compute_johns_delta, compute_kilgarriff, compute_minmax
+ This module provides methods for authorship attribution - comparing texts to
+ determine whether they were written by the same author. Available methods
+ include classic approaches (Burrows' Delta, Zeta), statistical methods
+ (Kilgarriff's chi-squared), and information-theoretic methods (NCD).
+
+ Related GitHub Issues:
+     #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+     #31 - Classical Stylometric Methods from Programming Historian
+         https://github.com/craigtrim/pystylometry/issues/31
+
+ Available Functions:
+     compute_burrows_delta: Classic Delta method for authorship distance
+     compute_cosine_delta: Angular distance variant of Delta
+     compute_zeta: Zeta method for marker word detection
+     compute_kilgarriff: Chi-squared method for corpus comparison
+     compute_minmax: Burrows' original min-max distance method
+     compute_johns_delta: Delta variations (quadratic, weighted)
+     compute_compression_distance: Normalized Compression Distance (NCD)
+ """
+
+ from .additional_methods import compute_johns_delta, compute_minmax
  from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+ from .compression import compute_compression_distance
+ from .kilgarriff import compute_kilgarriff
  from .zeta import compute_zeta

  __all__ = [
      "compute_burrows_delta",
+     "compute_compression_distance",
      "compute_cosine_delta",
-     "compute_zeta",
+     "compute_johns_delta",
      "compute_kilgarriff",
      "compute_minmax",
-     "compute_johns_delta",
+     "compute_zeta",
  ]
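After this change the subpackage's import surface is exactly the seven names in `__all__`. A quick sketch (assuming the package is installed) showing which submodule each re-exported function comes from, as listed in the import lines above:

```python
from pystylometry import authorship

# Iterate over the seven public entry points re-exported by
# pystylometry.authorship and report their defining submodule.
for name in sorted(authorship.__all__):
    print(name, "->", getattr(authorship, name).__module__)
```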
pystylometry/authorship/additional_methods.py
@@ -8,65 +8,150 @@ Related GitHub Issue:
      https://github.com/craigtrim/pystylometry/issues/24

  Methods implemented:
- - Kilgarriff's Chi-squared
- - Min-Max (Burrows' original method)
- - John Burrows' Delta variations
+ - Kilgarriff's Chi-squared -> See kilgarriff.py (Issue #31)
+ - Min-Max distance (Burrows' original method)
+ - John Burrows' Delta variations (Quadratic, Weighted)

  References:
      Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
      Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
      Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+     Argamon, S. (2008). Interpreting Burrows's Delta. Literary and Linguistic Computing.
  """

- from .._types import JohnsBurrowsResult, KilgarriffResult, MinMaxResult
+ from __future__ import annotations

+ import math
+ from collections import Counter

- def compute_kilgarriff(text1: str, text2: str, mfw: int = 100) -> KilgarriffResult:
+ from .._types import JohnsBurrowsResult, MinMaxResult
+ from .._utils import tokenize
+
+
+ def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
      """
-     Compute Kilgarriff's Chi-squared distance between two texts.
+     Compute Min-Max distance between two texts.
+
+     This is Burrows' original method from his 1992 paper, before the development
+     of Delta. It normalizes word frequencies using min-max scaling and computes
+     the mean absolute distance between normalized frequency vectors.

      Related GitHub Issue:
          #24 - Additional Authorship Attribution Methods
          https://github.com/craigtrim/pystylometry/issues/24

+     Algorithm:
+         1. Tokenize both texts and build frequency distributions
+         2. Identify the top N most frequent words in the joint corpus
+         3. Compute relative frequencies for each word in each text
+         4. Normalize each word's frequencies using min-max scaling:
+            normalized(f) = (f - min) / (max - min)
+         5. Compute mean absolute difference of normalized frequencies
+
+     Interpretation:
+         - Lower values indicate more similar texts (likely same author)
+         - Higher values indicate more different texts
+         - Scale: 0.0 (identical) to 1.0 (maximally different)
+
+     References:
+         Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+             nexus between analysis and information. Literary and Linguistic
+             Computing, 7(2), 91-109.
+
      Args:
          text1: First text for comparison
          text2: Second text for comparison
-         mfw: Number of most frequent words to analyze
+         mfw: Number of most frequent words to analyze (default: 100)

      Returns:
-         KilgarriffResult with chi-squared statistic, p-value, and
-         most distinctive features.
+         MinMaxResult with min-max distance and distinctive features.
+
+     Example:
+         >>> result = compute_minmax(text_by_author_a, text_by_author_b)
+         >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+         >>> print(f"Most distinctive: {result.most_distinctive_features[0]}")
      """
-     # TODO: Implement Kilgarriff's chi-squared
-     # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
-     raise NotImplementedError(
-         "Kilgarriff's chi-squared not yet implemented. "
-         "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
-     )
+     # Tokenize and lowercase
+     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+     tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]

+     if not tokens1 or not tokens2:
+         return MinMaxResult(
+             minmax_distance=0.0,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": len(tokens1),
+                 "text2_size": len(tokens2),
+                 "warning": "One or both texts are empty",
+             },
+         )

- def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
-     """
-     Compute Min-Max distance (Burrows' original method).
+     # Build frequency distributions
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+     size1 = len(tokens1)
+     size2 = len(tokens2)

-     Related GitHub Issue:
-         #24 - Additional Authorship Attribution Methods
-         https://github.com/craigtrim/pystylometry/issues/24
+     # Joint corpus: top N most frequent words
+     joint: Counter[str] = Counter()
+     joint.update(freq1)
+     joint.update(freq2)
+     top_words = [word for word, _ in joint.most_common(mfw)]

-     Args:
-         text1: First text for comparison
-         text2: Second text for comparison
-         mfw: Number of most frequent words to analyze
+     if not top_words:
+         return MinMaxResult(
+             minmax_distance=0.0,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": size1,
+                 "text2_size": size2,
+                 "warning": "No common words found",
+             },
+         )

-     Returns:
-         MinMaxResult with min-max distance and distinctive features.
-     """
-     # TODO: Implement Min-Max distance
-     # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
-     raise NotImplementedError(
-         "Min-Max distance not yet implemented. "
-         "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+     # Relative frequencies
+     rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+     rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+     # Min-Max normalization per feature across both texts
+     # Then compute absolute distance
+     contributions: list[tuple[str, float]] = []
+     total_distance = 0.0
+
+     for i, word in enumerate(top_words):
+         f1, f2 = rel1[i], rel2[i]
+         max_val = max(f1, f2)
+
+         if max_val > 0:
+             # Min-Max normalized distance for this feature
+             dist = abs(f1 - f2) / max_val
+         else:
+             dist = 0.0
+
+         total_distance += dist
+         contributions.append((word, dist))
+
+     # Sort contributions by magnitude
+     contributions.sort(key=lambda x: x[1], reverse=True)
+
+     # Mean distance across all features
+     minmax_distance = total_distance / len(top_words) if top_words else 0.0
+
+     return MinMaxResult(
+         minmax_distance=minmax_distance,
+         feature_count=len(top_words),
+         most_distinctive_features=contributions[:20],
+         metadata={
+             "text1_size": size1,
+             "text2_size": size2,
+             "text1_vocab": len(freq1),
+             "text2_vocab": len(freq2),
+             "mfw_requested": mfw,
+             "method": "minmax_1992",
+             "all_contributions": contributions,
+         },
      )


@@ -79,22 +164,157 @@ def compute_johns_delta(
      """
      Compute John Burrows' Delta variations.

+     This implements alternative formulations of Burrows' Delta metric beyond
+     the standard mean absolute z-score difference. The quadratic variant uses
+     squared z-score differences (Euclidean distance), while the weighted variant
+     applies inverse-rank weighting so higher-frequency words contribute more.
+
      Related GitHub Issue:
          #24 - Additional Authorship Attribution Methods
          https://github.com/craigtrim/pystylometry/issues/24

+     Methods:
+         - "quadratic": Euclidean distance of z-scores
+             Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
+
+         - "weighted": Inverse-rank weighted Delta
+             Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i)
+             where w_i = 1 / rank_i
+
+     Interpretation:
+         - Lower values indicate more similar texts (likely same author)
+         - Quadratic Delta penalizes large deviations more than standard Delta
+         - Weighted Delta emphasizes the most frequent words
+
+     References:
+         Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+             parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+         Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+             probabilistic foundations. Literary and Linguistic Computing,
+             23(2), 131-147.
+
      Args:
          text1: First text for comparison
          text2: Second text for comparison
-         mfw: Number of most frequent words to analyze
-         method: Delta variation ("quadratic", "weighted", "rotated")
+         mfw: Number of most frequent words to analyze (default: 100)
+         method: Delta variation ("quadratic" or "weighted")

      Returns:
          JohnsBurrowsResult with delta score and method details.
+
+     Example:
+         >>> result = compute_johns_delta(text1, text2, method="quadratic")
+         >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
      """
-     # TODO: Implement John's Delta variations
-     # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
-     raise NotImplementedError(
-         "John's Delta variations not yet implemented. "
-         "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+     if method not in ("quadratic", "weighted"):
+         raise ValueError(f"method must be 'quadratic' or 'weighted', got '{method}'")
+
+     # Tokenize and lowercase
+     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+     tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+     if not tokens1 or not tokens2:
+         return JohnsBurrowsResult(
+             delta_score=0.0,
+             method=method,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": len(tokens1),
+                 "text2_size": len(tokens2),
+                 "warning": "One or both texts are empty",
+             },
+         )
+
+     # Build frequency distributions
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+     size1 = len(tokens1)
+     size2 = len(tokens2)
+
+     # Joint corpus: top N most frequent words
+     joint: Counter[str] = Counter()
+     joint.update(freq1)
+     joint.update(freq2)
+     top_words = [word for word, _ in joint.most_common(mfw)]
+
+     if not top_words:
+         return JohnsBurrowsResult(
+             delta_score=0.0,
+             method=method,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": size1,
+                 "text2_size": size2,
+                 "warning": "No common words found",
+             },
+         )
+
+     # Relative frequencies
+     rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+     rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+     # Mean-normalized differences
+     # With only 2 texts, classical z-scores are degenerate: stdev([a,b]) is
+     # always |a-b|/sqrt(2), producing identical z-scores (±0.707) for all
+     # features with any difference. Instead, we normalize by the mean frequency
+     # of each feature across both texts, which preserves discriminative power:
+     #     normalized_i = (f1_i - f2_i) / mean(f1_i, f2_i)
+     # This weights words proportionally to how much they differ relative to
+     # their expected frequency, preventing high-frequency words from dominating
+     # through absolute differences alone.
+     z1: list[float] = []
+     z2: list[float] = []
+     for i in range(len(top_words)):
+         mean_val = (rel1[i] + rel2[i]) / 2
+         # Normalize by mean frequency; use epsilon for words absent in both
+         norm = mean_val if mean_val > 0 else 1e-10
+         z1.append((rel1[i] - mean_val) / norm)
+         z2.append((rel2[i] - mean_val) / norm)
+
+     # Compute distance based on method
+     contributions: list[tuple[str, float]] = []
+
+     if method == "quadratic":
+         # Quadratic Delta: root mean squared z-score difference
+         squared_diffs: list[float] = []
+         for i, word in enumerate(top_words):
+             diff_sq = (z1[i] - z2[i]) ** 2
+             squared_diffs.append(diff_sq)
+             contributions.append((word, diff_sq))
+
+         delta_score = math.sqrt(sum(squared_diffs) / len(squared_diffs)) if squared_diffs else 0.0
+
+     else: # weighted
+         # Weighted Delta: inverse-rank weighted mean absolute z-score difference
+         weighted_diffs: list[float] = []
+         weights: list[float] = []
+         for i, word in enumerate(top_words):
+             weight = 1.0 / (i + 1) # Inverse rank weighting
+             abs_diff = abs(z1[i] - z2[i])
+             weighted_diffs.append(weight * abs_diff)
+             weights.append(weight)
+             contributions.append((word, abs_diff))
+
+         delta_score = sum(weighted_diffs) / sum(weights) if weights else 0.0
+
+     # Sort contributions by magnitude
+     contributions.sort(key=lambda x: x[1], reverse=True)
+
+     return JohnsBurrowsResult(
+         delta_score=delta_score,
+         method=method,
+         feature_count=len(top_words),
+         most_distinctive_features=contributions[:20],
+         metadata={
+             "text1_size": size1,
+             "text2_size": size2,
+             "text1_vocab": len(freq1),
+             "text2_vocab": len(freq2),
+             "mfw_requested": mfw,
+             "z_scores_text1": dict(zip(top_words[:20], z1[:20])),
+             "z_scores_text2": dict(zip(top_words[:20], z2[:20])),
+             "all_contributions": contributions,
+         },
      )
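The two Delta formulas documented in this hunk reduce to a few lines of arithmetic. A standalone sketch with made-up per-word differences (independent of the package's tokenizer and result types), just to make the Delta_Q and Delta_W definitions concrete:

```python
import math

# Toy absolute normalized differences |z1_i - z2_i| for five most-frequent
# words, ordered by rank (rank 1 = most frequent). The numbers are invented
# purely to illustrate the formulas above.
z_diff = [0.10, 0.40, 0.05, 0.25, 0.60]

# Quadratic Delta: root mean squared difference, Delta_Q = sqrt(sum(d^2) / n)
delta_q = math.sqrt(sum(d * d for d in z_diff) / len(z_diff))

# Weighted Delta: inverse-rank weighted mean, Delta_W = sum(w_i * d_i) / sum(w_i)
weights = [1.0 / (i + 1) for i in range(len(z_diff))]
delta_w = sum(w * d for w, d in zip(weights, z_diff)) / sum(weights)

print(f"quadratic: {delta_q:.3f}")  # about 0.345
print(f"weighted:  {delta_w:.3f}")  # about 0.219
```

As the docstring notes, the quadratic form is the Euclidean distance of the difference vector scaled by 1/sqrt(n), so a single large deviation (here 0.60) pulls it up more than it pulls up the rank-weighted mean, which instead emphasizes the highest-ranked words.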
pystylometry/authorship/compression.py
@@ -0,0 +1,175 @@
+ """Compression-based authorship attribution using Normalized Compression Distance.
+
+ This module implements the Normalized Compression Distance (NCD) method for
+ authorship attribution. NCD is a language-independent similarity metric based
+ on Kolmogorov complexity, approximated through real-world compressors.
+
+ Related GitHub Issue:
+     #24 - Additional Authorship Attribution Methods
+     https://github.com/craigtrim/pystylometry/issues/24
+
+ The core insight is that if two texts share statistical regularities (as texts
+ by the same author tend to), compressing them together yields better compression
+ than compressing separately. This captures deep patterns including vocabulary,
+ syntax, and stylistic preferences without requiring explicit feature engineering.
+
+ References:
+     Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+         IEEE Transactions on Information Theory, 51(4), 1523-1545.
+     Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+         zipping. Physical Review Letters, 88(4), 048702.
+     Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+         Information Theory, 50(12), 3250-3264.
+ """
+
+ from __future__ import annotations
+
+ import bz2
+ import gzip
+ import zlib
+
+ from .._types import CompressionResult
+
+ # Supported compressors mapped to their compress functions
+ _COMPRESSORS: dict[str, type] = {
+     "gzip": type(None), # placeholder, handled below
+     "zlib": type(None),
+     "bz2": type(None),
+ }
+
+ _VALID_COMPRESSORS = frozenset({"gzip", "zlib", "bz2"})
+
+
+ def _compress(data: bytes, compressor: str) -> bytes:
+     """Compress data using the specified algorithm.
+
+     Args:
+         data: Raw bytes to compress.
+         compressor: One of "gzip", "zlib", or "bz2".
+
+     Returns:
+         Compressed bytes.
+     """
+     if compressor == "gzip":
+         return gzip.compress(data)
+     if compressor == "zlib":
+         return zlib.compress(data)
+     if compressor == "bz2":
+         return bz2.compress(data)
+     raise ValueError(f"Unknown compressor: {compressor}") # pragma: no cover
+
+
+ def compute_compression_distance(
+     text1: str,
+     text2: str,
+     compressor: str = "gzip",
+ ) -> CompressionResult:
+     """
+     Compute Normalized Compression Distance (NCD) between two texts.
+
+     NCD approximates the normalized information distance, a universal similarity
+     metric based on Kolmogorov complexity. Since Kolmogorov complexity is
+     uncomputable, NCD uses real-world compressors as practical approximations.
+
+     Related GitHub Issue:
+         #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+
+     Algorithm:
+         1. Encode both texts as UTF-8 bytes
+         2. Compress text1, text2, and their concatenation separately
+         3. Compute NCD using the formula:
+            NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+     Interpretation:
+         - NCD ~ 0: Texts are maximally similar (identical information content)
+         - NCD ~ 1: Texts are maximally different (no shared information)
+         - Values slightly above 1.0 are possible due to compressor overhead
+         - Typical same-author pairs: 0.3-0.6
+         - Typical different-author pairs: 0.6-0.9
+
+     Compressor choice:
+         - "gzip" (default): Good balance of speed and accuracy; most widely used
+           in NCD literature. Uses Lempel-Ziv (LZ77) algorithm.
+         - "zlib": Same underlying algorithm as gzip but lower overhead. Slightly
+           faster, very similar results.
+         - "bz2": Uses Burrows-Wheeler transform. Better compression but slower.
+           May capture different patterns than LZ-based methods.
+
+     References:
+         Cilibrasi, R., & Vitanyi, P. M. (2005). Clustering by compression.
+             IEEE Transactions on Information Theory, 51(4), 1523-1545.
+         Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+             zipping. Physical Review Letters, 88(4), 048702.
+         Li, M., et al. (2004). The similarity metric. IEEE Transactions on
+             Information Theory, 50(12), 3250-3264.
+
+     Args:
+         text1: First text for comparison.
+         text2: Second text for comparison.
+         compressor: Compression algorithm to use ("gzip", "zlib", or "bz2").
+
+     Returns:
+         CompressionResult with NCD score and compression details.
+
+     Raises:
+         ValueError: If compressor is not one of "gzip", "zlib", "bz2".
+
+     Example:
+         >>> result = compute_compression_distance(text_by_author_a, text_by_author_b)
+         >>> print(f"NCD: {result.ncd:.3f}")
+         >>> print(f"Compressor: {result.compressor}")
+         >>> if result.ncd < 0.5:
+         ...     print("Texts likely by same author")
+     """
+     if compressor not in _VALID_COMPRESSORS:
+         raise ValueError(
+             f"compressor must be one of {sorted(_VALID_COMPRESSORS)}, got '{compressor}'"
+         )
+
+     # Encode texts as bytes
+     bytes1 = text1.encode("utf-8")
+     bytes2 = text2.encode("utf-8")
+     bytes_combined = bytes1 + bytes2
+
+     # Compress each component
+     compressed1 = _compress(bytes1, compressor)
+     compressed2 = _compress(bytes2, compressor)
+     compressed_combined = _compress(bytes_combined, compressor)
+
+     c1 = len(compressed1)
+     c2 = len(compressed2)
+     c12 = len(compressed_combined)
+
+     # NCD formula: (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+     min_c = min(c1, c2)
+     max_c = max(c1, c2)
+
+     if max_c == 0:
+         # Both texts are empty
+         ncd = 0.0
+     else:
+         ncd = (c12 - min_c) / max_c
+
+     # Compute compression ratios for metadata
+     raw1 = len(bytes1)
+     raw2 = len(bytes2)
+     raw_combined = len(bytes_combined)
+
+     return CompressionResult(
+         ncd=ncd,
+         compressor=compressor,
+         text1_compressed_size=c1,
+         text2_compressed_size=c2,
+         combined_compressed_size=c12,
+         metadata={
+             "text1_raw_size": raw1,
+             "text2_raw_size": raw2,
+             "combined_raw_size": raw_combined,
+             "text1_compression_ratio": c1 / raw1 if raw1 > 0 else 0.0,
+             "text2_compression_ratio": c2 / raw2 if raw2 > 0 else 0.0,
+             "combined_compression_ratio": c12 / raw_combined if raw_combined > 0 else 0.0,
+             "min_compressed": min_c,
+             "max_compressed": max_c,
+         },
+     )
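The NCD arithmetic in this file needs nothing from the package and is easy to sanity-check on its own. A minimal stdlib-only sketch of the same formula, using gzip sizes; the sample strings are placeholders:

```python
import gzip


def ncd(x: str, y: str) -> float:
    """Normalized Compression Distance from gzip sizes:
    NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))."""
    bx, by = x.encode("utf-8"), y.encode("utf-8")
    cx = len(gzip.compress(bx))
    cy = len(gzip.compress(by))
    cxy = len(gzip.compress(bx + by))
    return (cxy - min(cx, cy)) / max(cx, cy)


# A text concatenated with itself compresses almost as tightly as one copy,
# so NCD stays near 0; unrelated texts share little structure, pushing NCD up.
a = "the quick brown fox jumps over the lazy dog " * 50
b = "colorless green ideas sleep furiously " * 50
print(round(ncd(a, a), 3))  # near 0
print(round(ncd(a, b), 3))  # noticeably higher
```

Unlike `compute_compression_distance`, this sketch returns only the bare score; the package function additionally records raw and compressed sizes and compression ratios in its result metadata, which is what the `consistency/` and `viz/` modules elsewhere in this release consume.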