pystylometry 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

pystylometry/README.md ADDED
@@ -0,0 +1,42 @@
+ # pystylometry
+
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)
+ ![License: MIT](https://img.shields.io/badge/license-MIT-green)
+
+ Core package for stylometric analysis and authorship attribution.
+
+ ## Module Map
+
+ | Module | Purpose | Key Functions |
+ |--------|---------|---------------|
+ | [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+ | [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+ | [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+ | [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+ | [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+ | [`character/`](character/) | Character-level features | `compute_character_metrics` |
+ | [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+ | [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+ | [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+ | [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+ | [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+ ## Shared Internals
+
+ | File | Purpose |
+ |------|---------|
+ | `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+ | `_normalize.py` | Text normalization for readability and stylometry pipelines |
+ | `_utils.py` | Shared tokenization and helper functions |
+ | `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+ | `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+ ## Installation Extras
+
+ ```
+ pip install pystylometry                # Core (lexical only)
+ pip install pystylometry[readability]   # + readability
+ pip install pystylometry[syntactic]     # + syntactic (requires spaCy)
+ pip install pystylometry[authorship]    # + authorship attribution
+ pip install pystylometry[all]           # Everything
+ ```
pystylometry/__init__.py CHANGED
@@ -63,18 +63,28 @@ try:
  except ImportError:
      _SYNTACTIC_AVAILABLE = False
  
- # Authorship, ngrams, dialect, and consistency use only stdlib (no external dependencies)
+ # Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+ try:
+     from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+     _PROSODY_AVAILABLE = True
+ except ImportError:
+     _PROSODY_AVAILABLE = False
+
+ # Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
  from . import (
      authorship,  # noqa: F401
      consistency,  # noqa: F401 - Style drift detection (Issue #36)
      dialect,  # noqa: F401
      ngrams,  # noqa: F401
+     stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
  )
  
  _AUTHORSHIP_AVAILABLE = True
  _NGRAMS_AVAILABLE = True
  _DIALECT_AVAILABLE = True
  _CONSISTENCY_AVAILABLE = True
+ _STYLISTIC_AVAILABLE = True
  
  
  def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
          "ngrams": _NGRAMS_AVAILABLE,
          "dialect": _DIALECT_AVAILABLE,
          "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+         "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+         "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
      }
  
  
@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
      __all__.append("dialect")
  if _CONSISTENCY_AVAILABLE:
      __all__.append("consistency")
+ if _STYLISTIC_AVAILABLE:
+     __all__.append("stylistic")
+ if _PROSODY_AVAILABLE:
+     __all__.append("prosody")
pystylometry/_types.py CHANGED
@@ -1517,6 +1517,7 @@ class VocabularyOverlapResult:
      - Dice coefficient (2 * intersection / sum of sizes)
      - Overlap coefficient (intersection / min(size1, size2))
      - Cosine similarity (using word frequency vectors)
+     - KL divergence (asymmetric distributional difference)
      - Shared vocabulary size and ratio
      - Unique words in each text
      - Most distinctive words for each text
@@ -1526,6 +1527,10 @@ class VocabularyOverlapResult:
          New Phytologist, 11(2), 37-50.
      Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
          Retrieval. McGraw-Hill.
+     Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+         Annals of Mathematical Statistics, 22(1), 79-86.
+     Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+         MIT Press.
  
      Example:
          >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1544,7 @@ class VocabularyOverlapResult:
      dice_coefficient: float  # 2 * intersection / (size1 + size2)
      overlap_coefficient: float  # Intersection / min(size1, size2)
      cosine_similarity: float  # Cosine of frequency vectors
+     kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)
  
      # Vocabulary sizes
      text1_vocab_size: int  # Unique words in text 1
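Editor's note, not part of the diff: the hunk above only adds the `kl_divergence` result field; the computation itself lives in `stylistic/` and is not shown. For reference, a generic sketch of KL divergence D(text1 ‖ text2) over word-frequency distributions. The epsilon smoothing of zero counts is an assumption, not necessarily what `compute_vocabulary_overlap` does.

```python
# Generic KL divergence over word frequencies (editor's sketch, not the
# package's implementation). Zero counts are floored at `eps`, which is an
# assumption; pystylometry's own smoothing strategy is not visible in this diff.
import math
from collections import Counter


def kl_divergence(tokens1: list[str], tokens2: list[str], eps: float = 1e-10) -> float:
    p, q = Counter(tokens1), Counter(tokens2)
    n1, n2 = sum(p.values()), sum(q.values())
    total = 0.0
    for word in set(p) | set(q):
        p_w = max(p[word] / n1, eps) if n1 else eps
        q_w = max(q[word] / n2, eps) if n2 else eps
        total += p_w * math.log(p_w / q_w)  # this word's contribution to D(P || Q)
    return total
```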
@@ -1897,6 +1903,54 @@ class JohnsBurrowsResult:
      metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.
  
  
+ @dataclass
+ class CompressionResult:
+     """Result from compression-based authorship attribution.
+
+     Compression-based methods use the Normalized Compression Distance (NCD) to
+     measure similarity between texts. The intuition is that if two texts are
+     similar, compressing them together will yield better compression than
+     compressing separately. This approach is language-independent and captures
+     deep statistical regularities.
+
+     Related GitHub Issue:
+         #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+
+     Formula:
+         NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+         where C(x) is the compressed size of x, and C(xy) is the compressed
+         size of x and y concatenated.
+
+     Interpretation:
+         - NCD ≈ 0: Texts are very similar
+         - NCD ≈ 1: Texts are very different
+         - Typical same-author pairs: 0.3-0.6
+         - Typical different-author pairs: 0.6-0.9
+
+     References:
+         Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+             IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+         Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+             zipping. Physical Review Letters, 88(4), 048702.
+
+     Example:
+         >>> result = compute_compression_distance(text1, text2)
+         >>> print(f"NCD: {result.ncd:.3f}")
+         >>> if result.ncd < 0.5:
+         ...     print("Texts likely by same author")
+     """
+
+     ncd: float  # Normalized Compression Distance [0, 1+]
+     compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+     text1_compressed_size: int  # Compressed size of text1 alone
+     text2_compressed_size: int  # Compressed size of text2 alone
+     combined_compressed_size: int  # Compressed size of concatenated texts
+     metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
  # ===== Rhythm and Prosody Results =====
  # Related to GitHub Issue #25: Rhythm and Prosody Metrics
  # https://github.com/craigtrim/pystylometry/issues/25
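Editor's note, not part of the diff: the `CompressionResult` docstring above gives the NCD formula, but the implementation in `authorship/compression.py` is not included in this hunk. A minimal sketch of the same formula using stdlib `zlib`; the compressor choice, level, and encoding are assumptions.

```python
# NCD(x, y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y)), as in the docstring
# above. Editor's sketch with zlib; the package's compression.py may use a
# different compressor or settings.
import zlib


def ncd(text1: str, text2: str) -> float:
    def c(s: str) -> int:
        return len(zlib.compress(s.encode("utf-8"), 9))  # compressed size in bytes

    c1, c2, c12 = c(text1), c(text2), c(text1 + text2)
    return (c12 - min(c1, c2)) / max(c1, c2)

# Lower values suggest more similar texts; see the interpretation ranges above.
```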
pystylometry/authorship/README.md ADDED
@@ -0,0 +1,21 @@
+ # authorship
+
+ ![7 public functions](https://img.shields.io/badge/functions-7-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Authorship attribution methods for comparing texts and determining likely authorship.
+
+ ## Catalogue
+
+ | File | Functions | Method |
+ |------|-----------|--------|
+ | `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+ | `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+ | `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+ | `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+ | `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+ ## See Also
+
+ - [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+ - [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
pystylometry/authorship/__init__.py CHANGED
@@ -2,8 +2,8 @@
  
  This module provides methods for authorship attribution - comparing texts to
  determine whether they were written by the same author. Available methods
- include classic approaches (Burrows' Delta, Zeta) and statistical methods
- (Kilgarriff's chi-squared).
+ include classic approaches (Burrows' Delta, Zeta), statistical methods
+ (Kilgarriff's chi-squared), and information-theoretic methods (NCD).
  
  Related GitHub Issues:
      #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
      compute_cosine_delta: Angular distance variant of Delta
      compute_zeta: Zeta method for marker word detection
      compute_kilgarriff: Chi-squared method for corpus comparison
-     compute_minmax: Burrows' original min-max method (not yet implemented)
-     compute_johns_delta: Delta variations (not yet implemented)
+     compute_minmax: Burrows' original min-max distance method
+     compute_johns_delta: Delta variations (quadratic, weighted)
+     compute_compression_distance: Normalized Compression Distance (NCD)
  """
  
  from .additional_methods import compute_johns_delta, compute_minmax
  from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+ from .compression import compute_compression_distance
  from .kilgarriff import compute_kilgarriff
  from .zeta import compute_zeta
  
  __all__ = [
      "compute_burrows_delta",
+     "compute_compression_distance",
      "compute_cosine_delta",
-     "compute_zeta",
+     "compute_johns_delta",
      "compute_kilgarriff",
      "compute_minmax",
-     "compute_johns_delta",
+     "compute_zeta",
  ]
pystylometry/authorship/additional_methods.py CHANGED
@@ -8,40 +8,150 @@ Related GitHub Issue:
      https://github.com/craigtrim/pystylometry/issues/24
  
  Methods implemented:
-     - Kilgarriff's Chi-squared → See kilgarriff.py (Issue #31)
-     - Min-Max (Burrows' original method) → Not yet implemented
-     - John Burrows' Delta variations → Not yet implemented
+     - Kilgarriff's Chi-squared -> See kilgarriff.py (Issue #31)
+     - Min-Max distance (Burrows' original method)
+     - John Burrows' Delta variations (Quadratic, Weighted)
  
  References:
      Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
      Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
      Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+     Argamon, S. (2008). Interpreting Burrows's Delta. Literary and Linguistic Computing.
  """
  
+ from __future__ import annotations
+
+ import math
+ from collections import Counter
+
  from .._types import JohnsBurrowsResult, MinMaxResult
+ from .._utils import tokenize
  
  
  def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
      """
-     Compute Min-Max distance (Burrows' original method).
+     Compute Min-Max distance between two texts.
+
+     This is Burrows' original method from his 1992 paper, before the development
+     of Delta. It normalizes word frequencies using min-max scaling and computes
+     the mean absolute distance between normalized frequency vectors.
  
      Related GitHub Issue:
          #24 - Additional Authorship Attribution Methods
          https://github.com/craigtrim/pystylometry/issues/24
  
+     Algorithm:
+         1. Tokenize both texts and build frequency distributions
+         2. Identify the top N most frequent words in the joint corpus
+         3. Compute relative frequencies for each word in each text
+         4. Normalize each word's frequencies using min-max scaling:
+            normalized(f) = (f - min) / (max - min)
+         5. Compute mean absolute difference of normalized frequencies
+
+     Interpretation:
+         - Lower values indicate more similar texts (likely same author)
+         - Higher values indicate more different texts
+         - Scale: 0.0 (identical) to 1.0 (maximally different)
+
+     References:
+         Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+             nexus between analysis and information. Literary and Linguistic
+             Computing, 7(2), 91-109.
+
      Args:
          text1: First text for comparison
          text2: Second text for comparison
-         mfw: Number of most frequent words to analyze
+         mfw: Number of most frequent words to analyze (default: 100)
  
      Returns:
          MinMaxResult with min-max distance and distinctive features.
+
+     Example:
+         >>> result = compute_minmax(text_by_author_a, text_by_author_b)
+         >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+         >>> print(f"Most distinctive: {result.most_distinctive_features[0]}")
      """
-     # TODO: Implement Min-Max distance
-     # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
-     raise NotImplementedError(
-         "Min-Max distance not yet implemented. "
-         "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+     # Tokenize and lowercase
+     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+     tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+     if not tokens1 or not tokens2:
+         return MinMaxResult(
+             minmax_distance=0.0,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": len(tokens1),
+                 "text2_size": len(tokens2),
+                 "warning": "One or both texts are empty",
+             },
+         )
+
+     # Build frequency distributions
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+     size1 = len(tokens1)
+     size2 = len(tokens2)
+
+     # Joint corpus: top N most frequent words
+     joint: Counter[str] = Counter()
+     joint.update(freq1)
+     joint.update(freq2)
+     top_words = [word for word, _ in joint.most_common(mfw)]
+
+     if not top_words:
+         return MinMaxResult(
+             minmax_distance=0.0,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": size1,
+                 "text2_size": size2,
+                 "warning": "No common words found",
+             },
+         )
+
+     # Relative frequencies
+     rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+     rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+     # Min-Max normalization per feature across both texts
+     # Then compute absolute distance
+     contributions: list[tuple[str, float]] = []
+     total_distance = 0.0
+
+     for i, word in enumerate(top_words):
+         f1, f2 = rel1[i], rel2[i]
+         max_val = max(f1, f2)
+
+         if max_val > 0:
+             # Min-Max normalized distance for this feature
+             dist = abs(f1 - f2) / max_val
+         else:
+             dist = 0.0
+
+         total_distance += dist
+         contributions.append((word, dist))
+
+     # Sort contributions by magnitude
+     contributions.sort(key=lambda x: x[1], reverse=True)
+
+     # Mean distance across all features
+     minmax_distance = total_distance / len(top_words) if top_words else 0.0
+
+     return MinMaxResult(
+         minmax_distance=minmax_distance,
+         feature_count=len(top_words),
+         most_distinctive_features=contributions[:20],
+         metadata={
+             "text1_size": size1,
+             "text2_size": size2,
+             "text1_vocab": len(freq1),
+             "text2_vocab": len(freq2),
+             "mfw_requested": mfw,
+             "method": "minmax_1992",
+             "all_contributions": contributions,
+         },
      )
  
  
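Editor's note, not part of the diff: to make the per-feature distance in the loop above concrete, a word with relative frequencies 0.02 and 0.01 contributes |0.02 - 0.01| / max(0.02, 0.01) = 0.5, and `minmax_distance` is the mean of these per-word values.

```python
# Worked example of the per-feature distance computed in compute_minmax above
# (editor's sketch, not from the package).
f1, f2 = 0.02, 0.01
dist = abs(f1 - f2) / max(f1, f2)
print(dist)  # 0.5 -> this word contributes 0.5 toward the mean minmax_distance
```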
@@ -54,22 +164,157 @@ def compute_johns_delta(
      """
      Compute John Burrows' Delta variations.
  
+     This implements alternative formulations of Burrows' Delta metric beyond
+     the standard mean absolute z-score difference. The quadratic variant uses
+     squared z-score differences (Euclidean distance), while the weighted variant
+     applies inverse-rank weighting so higher-frequency words contribute more.
+
      Related GitHub Issue:
          #24 - Additional Authorship Attribution Methods
          https://github.com/craigtrim/pystylometry/issues/24
  
+     Methods:
+         - "quadratic": Euclidean distance of z-scores
+           Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
+
+         - "weighted": Inverse-rank weighted Delta
+           Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i)
+           where w_i = 1 / rank_i
+
+     Interpretation:
+         - Lower values indicate more similar texts (likely same author)
+         - Quadratic Delta penalizes large deviations more than standard Delta
+         - Weighted Delta emphasizes the most frequent words
+
+     References:
+         Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+             parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+         Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+             probabilistic foundations. Literary and Linguistic Computing,
+             23(2), 131-147.
+
      Args:
          text1: First text for comparison
          text2: Second text for comparison
-         mfw: Number of most frequent words to analyze
-         method: Delta variation ("quadratic", "weighted", "rotated")
+         mfw: Number of most frequent words to analyze (default: 100)
+         method: Delta variation ("quadratic" or "weighted")
  
      Returns:
          JohnsBurrowsResult with delta score and method details.
+
+     Example:
+         >>> result = compute_johns_delta(text1, text2, method="quadratic")
+         >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
      """
-     # TODO: Implement John's Delta variations
-     # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
-     raise NotImplementedError(
-         "John's Delta variations not yet implemented. "
-         "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+     if method not in ("quadratic", "weighted"):
+         raise ValueError(f"method must be 'quadratic' or 'weighted', got '{method}'")
+
+     # Tokenize and lowercase
+     tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+     tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+     if not tokens1 or not tokens2:
+         return JohnsBurrowsResult(
+             delta_score=0.0,
+             method=method,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": len(tokens1),
+                 "text2_size": len(tokens2),
+                 "warning": "One or both texts are empty",
+             },
+         )
+
+     # Build frequency distributions
+     freq1 = Counter(tokens1)
+     freq2 = Counter(tokens2)
+     size1 = len(tokens1)
+     size2 = len(tokens2)
+
+     # Joint corpus: top N most frequent words
+     joint: Counter[str] = Counter()
+     joint.update(freq1)
+     joint.update(freq2)
+     top_words = [word for word, _ in joint.most_common(mfw)]
+
+     if not top_words:
+         return JohnsBurrowsResult(
+             delta_score=0.0,
+             method=method,
+             feature_count=0,
+             most_distinctive_features=[],
+             metadata={
+                 "text1_size": size1,
+                 "text2_size": size2,
+                 "warning": "No common words found",
+             },
+         )
+
+     # Relative frequencies
+     rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+     rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+     # Mean-normalized differences
+     # With only 2 texts, classical z-scores are degenerate: stdev([a,b]) is
+     # always |a-b|/sqrt(2), producing identical z-scores (±0.707) for all
+     # features with any difference. Instead, we normalize by the mean frequency
+     # of each feature across both texts, which preserves discriminative power:
+     #     normalized_i = (f1_i - f2_i) / mean(f1_i, f2_i)
+     # This weights words proportionally to how much they differ relative to
+     # their expected frequency, preventing high-frequency words from dominating
+     # through absolute differences alone.
+     z1: list[float] = []
+     z2: list[float] = []
+     for i in range(len(top_words)):
+         mean_val = (rel1[i] + rel2[i]) / 2
+         # Normalize by mean frequency; use epsilon for words absent in both
+         norm = mean_val if mean_val > 0 else 1e-10
+         z1.append((rel1[i] - mean_val) / norm)
+         z2.append((rel2[i] - mean_val) / norm)
+
+     # Compute distance based on method
+     contributions: list[tuple[str, float]] = []
+
+     if method == "quadratic":
+         # Quadratic Delta: root mean squared z-score difference
+         squared_diffs: list[float] = []
+         for i, word in enumerate(top_words):
+             diff_sq = (z1[i] - z2[i]) ** 2
+             squared_diffs.append(diff_sq)
+             contributions.append((word, diff_sq))
+
+         delta_score = math.sqrt(sum(squared_diffs) / len(squared_diffs)) if squared_diffs else 0.0
+
+     else:  # weighted
+         # Weighted Delta: inverse-rank weighted mean absolute z-score difference
+         weighted_diffs: list[float] = []
+         weights: list[float] = []
+         for i, word in enumerate(top_words):
+             weight = 1.0 / (i + 1)  # Inverse rank weighting
+             abs_diff = abs(z1[i] - z2[i])
+             weighted_diffs.append(weight * abs_diff)
+             weights.append(weight)
+             contributions.append((word, abs_diff))
+
+         delta_score = sum(weighted_diffs) / sum(weights) if weights else 0.0
+
+     # Sort contributions by magnitude
+     contributions.sort(key=lambda x: x[1], reverse=True)
+
+     return JohnsBurrowsResult(
+         delta_score=delta_score,
+         method=method,
+         feature_count=len(top_words),
+         most_distinctive_features=contributions[:20],
+         metadata={
+             "text1_size": size1,
+             "text2_size": size2,
+             "text1_vocab": len(freq1),
+             "text2_vocab": len(freq2),
+             "mfw_requested": mfw,
+             "z_scores_text1": dict(zip(top_words[:20], z1[:20])),
+             "z_scores_text2": dict(zip(top_words[:20], z2[:20])),
+             "all_contributions": contributions,
+         },
      )
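Editor's note, not part of the diff: a small worked example of the inverse-rank weighting described in the docstring above. With mean-normalized absolute differences 0.40, 0.10, 0.30 for the three most frequent words, the weights are 1, 1/2, 1/3, and the weighted Delta is (0.40·1 + 0.10·0.5 + 0.30/3) / (1 + 0.5 + 1/3) = 0.55 / 1.833 ≈ 0.30.

```python
# Inverse-rank weighted Delta on three hypothetical features (editor's sketch,
# mirroring the `weighted` branch of compute_johns_delta above).
abs_diffs = [0.40, 0.10, 0.30]               # |z1_i - z2_i| by frequency rank
weights = [1.0 / (i + 1) for i in range(3)]  # [1.0, 0.5, 0.333...]
delta_w = sum(w * d for w, d in zip(weights, abs_diffs)) / sum(weights)
print(round(delta_w, 3))  # 0.3
```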