pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/consistency/drift.py (new file)
@@ -0,0 +1,549 @@
+ """Kilgarriff chi-squared drift detection for intra-document stylistic analysis.
+
+ This module implements drift detection within a single document by applying
+ Kilgarriff's chi-squared method to sequential chunks of text. It enables
+ detection of stylistic inconsistencies, AI-generated content signatures,
+ multi-author documents, and pasted/edited content.
+
+ Related GitHub Issues:
+     #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+         https://github.com/craigtrim/pystylometry/issues/36
+     #31 - Classical Stylometric Methods from Programming Historian
+         https://github.com/craigtrim/pystylometry/issues/31
+     #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
+ Core Concept:
+     By comparing sequential chunks of a single document, we can measure
+     **internal stylistic consistency**. Human writing typically shows natural
+     variation in chi-squared scores between chunks. AI-generated text often
+     shows either suspicious uniformity or periodic resets.
+
+ Pattern Signatures:
+     The function classifies detected patterns into named categories:
+
+     - consistent: Low, stable χ² across pairs
+         → Natural human writing with normal variation
+         → Well-edited, single-author text
+
+     - gradual_drift: Slowly increasing χ² trend
+         → Author fatigue (style degrades over time)
+         → Topic evolution affecting vocabulary
+         → Editing that becomes progressively heavier
+
+     - sudden_spike: One or more pairs have abnormally high χ²
+         → Pasted content from different source
+         → Different author wrote a section
+         → Heavy editing in one region
+
+     - suspiciously_uniform: Near-zero variance in χ² scores
+         → Possible AI generation (too consistent)
+         → Text generated in single session without revision
+         → Copy-pasted repetitive content
+
+     - unknown: Insufficient data for classification
+         → Text too short (fewer than MIN_WINDOWS chunks)
+
+ Sliding Window Support:
+     The function supports overlapping windows via the `stride` parameter:
+     - stride == window_size: Non-overlapping chunks (original behavior)
+     - stride < window_size: Overlapping windows (smoother drift curve)
+     - stride > window_size: Gaps between windows (sparse sampling)
+
+     50% overlap (stride = window_size / 2) is recommended for smooth detection.
+
+ Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+ References:
+     Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+     Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+     doi: 10.1075/ijcl.6.1.05kil
+
+     Eder, Maciej. "Does Size Matter? Authorship Attribution, Small Samples,
+     Big Problem." Digital Scholarship in the Humanities, vol. 30, no. 2,
+     2015, pp. 167-182.
+
+     Juola, Patrick. "Authorship Attribution." Foundations and Trends in
+     Information Retrieval, vol. 1, no. 3, 2006, pp. 233-334.
+ """
+
+ from __future__ import annotations
+
+ import statistics
+ from typing import Any
+
+ from .._types import KilgarriffDriftResult
+ from .._utils import tokenize
+ from ..authorship.kilgarriff import _kilgarriff_core
+ from ._thresholds import (
+     CONFIDENCE_MIN_WINDOWS,
+     MARGINAL_DATA_MAX_CONFIDENCE,
+     MIN_WINDOWS,
+     RECOMMENDED_WINDOWS,
+     SPIKE_MIN_ABSOLUTE,
+     SPIKE_RATIO,
+     TREND_R_SQUARED_THRESHOLD,
+     TREND_SLOPE_THRESHOLD,
+     UNIFORM_CV_THRESHOLD,
+     UNIFORM_MEAN_THRESHOLD,
+     get_all_thresholds,
+ )
+
+
+ def _create_sliding_windows(
+     tokens: list[str],
+     window_size: int,
+     stride: int,
+ ) -> list[list[str]]:
+     """
+     Create sliding windows over a token list.
+
+     This function implements the sliding window mechanism for drift detection.
+     Windows can overlap (stride < window_size), be non-overlapping
+     (stride == window_size), or have gaps (stride > window_size).
+
+     Related GitHub Issue:
+         #36 - Sliding window support for drift detection
+         https://github.com/craigtrim/pystylometry/issues/36
+
+     Args:
+         tokens: List of tokens to window over
+         window_size: Number of tokens per window
+         stride: Number of tokens to advance between windows
+
+     Returns:
+         List of token lists, each representing one window
+
+     Example:
+         >>> tokens = ["a", "b", "c", "d", "e", "f", "g", "h"]
+         >>> windows = _create_sliding_windows(tokens, window_size=4, stride=2)
+         >>> # windows[0] = ["a", "b", "c", "d"]
+         >>> # windows[1] = ["c", "d", "e", "f"]
+         >>> # windows[2] = ["e", "f", "g", "h"]
+         >>> # windows[3] = ["g", "h"]  (partial final window, kept because len >= window_size * 0.5)
+     """
+     if stride <= 0:
+         raise ValueError(f"stride must be positive, got {stride}")
+     if window_size <= 0:
+         raise ValueError(f"window_size must be positive, got {window_size}")
+
+     windows = []
+     start = 0
+
+     while start + window_size <= len(tokens):
+         window = tokens[start : start + window_size]
+         windows.append(window)
+         start += stride
+
+     # Handle final partial window if text doesn't divide evenly
+     # Only include if it has at least 50% of window_size tokens
+     if start < len(tokens):
+         final_window = tokens[start:]
+         if len(final_window) >= window_size * 0.5:
+             windows.append(final_window)
+
+     return windows
+
+
+ def _compute_trend(values: list[float]) -> tuple[float, float]:
+     """
+     Compute linear regression slope and R-squared for trend detection.
+
+     Uses simple linear regression to detect whether chi-squared values
+     are increasing or decreasing over the document.
+
+     Related GitHub Issue:
+         #36 - Gradual drift pattern detection
+         https://github.com/craigtrim/pystylometry/issues/36
+
+     Args:
+         values: List of chi-squared values in order
+
+     Returns:
+         Tuple of (slope, r_squared)
+         - slope: Chi-squared units per comparison (positive = increasing)
+         - r_squared: Coefficient of determination (0-1, higher = better fit)
+
+     Example:
+         >>> values = [10.0, 15.0, 20.0, 25.0, 30.0]  # Linear increase
+         >>> slope, r_sq = _compute_trend(values)
+         >>> # slope ≈ 5.0, r_sq ≈ 1.0
+     """
+     if len(values) < 2:
+         return 0.0, 0.0
+
+     n = len(values)
+     x = list(range(n))  # 0, 1, 2, ...
+
+     # Means
+     mean_x = sum(x) / n
+     mean_y = sum(values) / n
+
+     # Covariance and variance
+     cov_xy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, values))
+     var_x = sum((xi - mean_x) ** 2 for xi in x)
+     var_y = sum((yi - mean_y) ** 2 for yi in values)
+
+     # Slope
+     if var_x == 0:
+         return 0.0, 0.0
+     slope = cov_xy / var_x
+
+     # R-squared
+     if var_y == 0:
+         r_squared = 1.0 if slope == 0 else 0.0
+     else:
+         # R² = 1 - (residual sum of squares) / (total sum of squares)
+         ss_res = sum((yi - (mean_y + slope * (xi - mean_x))) ** 2 for xi, yi in zip(x, values))
+         ss_tot = var_y  # var_y already holds the total sum of squares; scaling it by n would inflate R²
+         r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+     return slope, max(0.0, min(1.0, r_squared))
+
+
+ def _classify_pattern(
+     mean_chi: float,
+     std_chi: float,
+     max_chi: float,
+     min_chi: float,
+     trend_slope: float,
+     trend_r_squared: float,
+     window_count: int,
+ ) -> tuple[str, float]:
+     """
+     Classify the detected pattern and compute confidence score.
+
+     This function implements the pattern classification logic described in
+     _thresholds.py. It uses a decision tree approach, checking for each
+     pattern in order of specificity.
+
+     Related GitHub Issue:
+         #36 - Pattern classification for drift detection
+         https://github.com/craigtrim/pystylometry/issues/36
+
+     Decision Order:
+         1. Suspiciously uniform (near-zero variance)
+         2. Sudden spike (outlier max value)
+         3. Gradual drift (significant trend)
+         4. Consistent (default)
+
+     Args:
+         mean_chi: Mean chi-squared across comparisons
+         std_chi: Standard deviation of chi-squared values
+         max_chi: Maximum chi-squared value
+         min_chi: Minimum chi-squared value
+         trend_slope: Linear regression slope
+         trend_r_squared: R-squared of trend fit
+         window_count: Number of windows analyzed
+
+     Returns:
+         Tuple of (pattern_name, confidence)
+         - pattern_name: One of "consistent", "gradual_drift", "sudden_spike",
+           "suspiciously_uniform", "unknown"
+         - confidence: 0.0-1.0 confidence in the classification
+
+     Note:
+         Confidence is scaled down for marginal data (few windows).
+     """
+     # Base confidence scales with window count
+     if window_count < MIN_WINDOWS:
+         return "unknown", 0.0
+
+     base_confidence = min(1.0, window_count / CONFIDENCE_MIN_WINDOWS)
+     if window_count < RECOMMENDED_WINDOWS:
+         base_confidence = min(base_confidence, MARGINAL_DATA_MAX_CONFIDENCE)
+
+     # Handle edge case of zero mean
+     if mean_chi == 0:
+         return "consistent", base_confidence
+
+     # Coefficient of variation
+     cv = std_chi / mean_chi if mean_chi > 0 else 0.0
+
+     # 1. Check for suspiciously uniform (AI signature)
+     # Very low variance with low mean suggests artificial consistency
+     if cv < UNIFORM_CV_THRESHOLD and mean_chi < UNIFORM_MEAN_THRESHOLD:
+         # Confidence increases as CV decreases
+         uniformity_strength = 1 - (cv / UNIFORM_CV_THRESHOLD)
+         confidence = base_confidence * (0.6 + 0.4 * uniformity_strength)
+         return "suspiciously_uniform", confidence
+
+     # 2. Check for sudden spike (discontinuity)
+     # Max significantly exceeds mean, indicating an outlier
+     spike_ratio = max_chi / mean_chi if mean_chi > 0 else 0.0
+     if spike_ratio > SPIKE_RATIO and max_chi > SPIKE_MIN_ABSOLUTE:
+         # Confidence based on how extreme the spike is
+         spike_strength = min(1.0, (spike_ratio - SPIKE_RATIO) / SPIKE_RATIO)
+         confidence = base_confidence * (0.7 + 0.3 * spike_strength)
+         return "sudden_spike", confidence
+
+     # 3. Check for gradual drift (trend)
+     # Significant slope with reasonable R-squared
+     if abs(trend_slope) > TREND_SLOPE_THRESHOLD and trend_r_squared > TREND_R_SQUARED_THRESHOLD:
+         # Confidence based on R-squared (how well trend explains variance)
+         confidence = base_confidence * (0.5 + 0.5 * trend_r_squared)
+         return "gradual_drift", confidence
+
+     # 4. Default: consistent (natural human variation)
+     # Moderate variance, no extreme patterns
+     confidence = base_confidence * 0.8  # Slightly lower confidence for "normal"
+     return "consistent", confidence
+
+
+ def compute_kilgarriff_drift(
+     text: str,
+     window_size: int = 1000,
+     stride: int = 500,
+     comparison_mode: str = "sequential",
+     lag: int = 1,
+     n_words: int = 500,
+ ) -> KilgarriffDriftResult:
+     """
+     Detect stylistic drift within a single document using Kilgarriff's chi-squared.
+
+     This function chunks a single text and computes chi-squared distances between
+     sequential (or all) chunk pairs to measure internal stylistic consistency.
+     It classifies the detected pattern and returns detailed metrics for analysis.
+
+     Related GitHub Issues:
+         #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+             https://github.com/craigtrim/pystylometry/issues/36
+         #31 - Classical Stylometric Methods from Programming Historian
+             https://github.com/craigtrim/pystylometry/issues/31
+
+     Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+     Pattern Signatures:
+         - consistent: Low, stable χ² across pairs (natural human writing)
+         - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
+         - sudden_spike: One pair has high χ² (pasted content, different author)
+         - suspiciously_uniform: Near-zero variance (possible AI generation)
+
+     Algorithm:
+         1. Tokenize text and create sliding windows
+         2. For each window pair (based on comparison_mode):
+            a. Compute chi-squared using Kilgarriff's method
+            b. Record contributing words
+         3. Compute holistic metrics (mean, std, max, trend)
+         4. Classify pattern based on thresholds
+         5. Return detailed result with all metrics
+
+     References:
+         Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+         Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+
+     Args:
+         text: Input text to analyze for stylistic drift
+         window_size: Number of words per analysis window (default: 1000).
+             Larger windows provide more stable chi-squared but fewer comparisons.
+         stride: Number of words to advance between windows (default: 500).
+             - stride == window_size: Non-overlapping chunks
+             - stride < window_size: Overlapping windows (smoother detection)
+             - stride > window_size: Gaps between windows (sparse sampling)
+         comparison_mode: How to compare windows (default: "sequential")
+             - "sequential": Compare adjacent windows only (1-2, 2-3, 3-4)
+             - "all_pairs": Compare every window pair (produces distance matrix)
+             - "fixed_lag": Compare windows at fixed distance (e.g., 1-3, 2-4)
+         lag: Window distance for fixed_lag mode (default: 1, ignored otherwise)
+         n_words: Top N most frequent words for chi-squared (default: 500)
+
+     Returns:
+         KilgarriffDriftResult containing:
+         - status: "success", "marginal_data", or "insufficient_data"
+         - pattern: Classified pattern name
+         - pattern_confidence: 0.0-1.0 confidence score
+         - mean_chi_squared, std_chi_squared, max_chi_squared: Statistics
+         - trend: Slope of chi-squared over document
+         - pairwise_scores: Detailed per-pair data
+         - And more (see KilgarriffDriftResult docstring)
+
+     Raises:
+         ValueError: If stride or window_size is not positive, or if
+             comparison_mode is not a recognized mode
+
+     Example:
+         >>> # Basic usage
+         >>> result = compute_kilgarriff_drift(long_text)
+         >>> print(f"Pattern: {result.pattern}")
+         >>> print(f"Confidence: {result.pattern_confidence:.2f}")
+
+         >>> # Custom sliding window
+         >>> result = compute_kilgarriff_drift(
+         ...     text,
+         ...     window_size=2000,  # Larger windows
+         ...     stride=1000,  # 50% overlap
+         ... )
+
+         >>> # Check for AI-generated content
+         >>> if result.pattern == "suspiciously_uniform":
+         ...     print("Warning: Text may be AI-generated")
+
+         >>> # Handle insufficient data gracefully
+         >>> if result.status == "insufficient_data":
+         ...     print(result.status_message)
+     """
+     # Input validation
+     if stride <= 0:
+         raise ValueError(f"stride must be positive, got {stride}")
+     if window_size <= 0:
+         raise ValueError(f"window_size must be positive, got {window_size}")
+     valid_modes = ("sequential", "all_pairs", "fixed_lag")
+     if comparison_mode not in valid_modes:
+         raise ValueError(f"comparison_mode must be one of {valid_modes}, got '{comparison_mode}'")
+
+     # Tokenize text
+     tokens = [t.lower() for t in tokenize(text) if t.isalpha()]
+
+     # Create sliding windows
+     windows = _create_sliding_windows(tokens, window_size, stride)
+     window_count = len(windows)
+
+     # Compute overlap ratio for metadata
+     overlap_ratio = max(0.0, 1 - (stride / window_size))
+
+     # Get thresholds for transparency
+     thresholds = get_all_thresholds()
+
+     # Check for insufficient data
+     if window_count < MIN_WINDOWS:
+         return KilgarriffDriftResult(
+             status="insufficient_data",
+             status_message=(
+                 f"Text produced {window_count} windows; minimum {MIN_WINDOWS} required. "
+                 f"Need approximately {window_size + (MIN_WINDOWS - 1) * stride} words."
+             ),
+             pattern="unknown",
+             pattern_confidence=0.0,
+             mean_chi_squared=float("nan"),
+             std_chi_squared=float("nan"),
+             max_chi_squared=float("nan"),
+             min_chi_squared=float("nan"),
+             max_location=-1,
+             trend=float("nan"),
+             pairwise_scores=[],
+             window_size=window_size,
+             stride=stride,
+             overlap_ratio=overlap_ratio,
+             comparison_mode=comparison_mode,
+             window_count=window_count,
+             distance_matrix=None,
+             thresholds=thresholds,
+             metadata={
+                 "total_tokens": len(tokens),
+                 "tokens_per_window": [len(w) for w in windows],
+             },
+         )
+
+     # Determine which pairs to compare based on mode
+     pairs_to_compare: list[tuple[int, int]] = []
+
+     if comparison_mode == "sequential":
+         # Compare adjacent windows: (0,1), (1,2), (2,3), ...
+         pairs_to_compare = [(i, i + 1) for i in range(window_count - 1)]
+
+     elif comparison_mode == "all_pairs":
+         # Compare all window pairs: (0,1), (0,2), ..., (n-2,n-1)
+         pairs_to_compare = [(i, j) for i in range(window_count) for j in range(i + 1, window_count)]
+
+     elif comparison_mode == "fixed_lag":
+         # Compare windows at fixed lag distance: (0,lag), (1,lag+1), ...
+         pairs_to_compare = [
+             (i, i + lag) for i in range(window_count - lag) if i + lag < window_count
+         ]
+
+     # Compute chi-squared for each pair
+     pairwise_scores: list[dict[str, Any]] = []
+     chi_squared_values: list[float] = []
+
+     for i, j in pairs_to_compare:
+         chi_sq, df, contributions, details = _kilgarriff_core(
+             windows[i], windows[j], n_words=n_words
+         )
+
+         pairwise_scores.append(
+             {
+                 "chunk_pair": (i, j),
+                 "chi_squared": chi_sq,
+                 "degrees_of_freedom": df,
+                 "top_words": contributions[:10],  # Top 10 contributing words
+                 "window_i_size": len(windows[i]),
+                 "window_j_size": len(windows[j]),
+             }
+         )
+         chi_squared_values.append(chi_sq)
+
+     # Build distance matrix for all_pairs mode
+     distance_matrix: list[list[float]] | None = None
+     if comparison_mode == "all_pairs":
+         distance_matrix = [[0.0] * window_count for _ in range(window_count)]
+         for score in pairwise_scores:
+             i, j = score["chunk_pair"]
+             distance_matrix[i][j] = score["chi_squared"]
+             distance_matrix[j][i] = score["chi_squared"]  # Symmetric
+
+     # Compute holistic statistics
+     if chi_squared_values:
+         mean_chi = statistics.mean(chi_squared_values)
+         std_chi = statistics.stdev(chi_squared_values) if len(chi_squared_values) > 1 else 0.0
+         max_chi = max(chi_squared_values)
+         min_chi = min(chi_squared_values)
+         max_location = chi_squared_values.index(max_chi)
+     else:
+         mean_chi = std_chi = max_chi = min_chi = float("nan")
+         max_location = -1
+
+     # Compute trend (only meaningful for sequential comparisons)
+     if comparison_mode == "sequential" and len(chi_squared_values) >= 2:
+         trend_slope, trend_r_squared = _compute_trend(chi_squared_values)
+     else:
+         trend_slope = 0.0
+         trend_r_squared = 0.0
+
+     # Classify pattern
+     pattern, pattern_confidence = _classify_pattern(
+         mean_chi=mean_chi,
+         std_chi=std_chi,
+         max_chi=max_chi,
+         min_chi=min_chi,
+         trend_slope=trend_slope,
+         trend_r_squared=trend_r_squared,
+         window_count=window_count,
+     )
+
+     # Determine status
+     if window_count >= RECOMMENDED_WINDOWS:
+         status = "success"
+         status_message = f"Analyzed {window_count} windows with {len(pairwise_scores)} comparisons."
+     else:
+         status = "marginal_data"
+         status_message = (
+             f"Analyzed {window_count} windows; {RECOMMENDED_WINDOWS}+ recommended "
+             f"for reliable pattern classification."
+         )
+
+     return KilgarriffDriftResult(
+         status=status,
+         status_message=status_message,
+         pattern=pattern,
+         pattern_confidence=pattern_confidence,
+         mean_chi_squared=mean_chi,
+         std_chi_squared=std_chi,
+         max_chi_squared=max_chi,
+         min_chi_squared=min_chi,
+         max_location=max_location,
+         trend=trend_slope,
+         pairwise_scores=pairwise_scores,
+         window_size=window_size,
+         stride=stride,
+         overlap_ratio=overlap_ratio,
+         comparison_mode=comparison_mode,
+         window_count=window_count,
+         distance_matrix=distance_matrix,
+         thresholds=thresholds,
+         metadata={
+             "total_tokens": len(tokens),
+             "tokens_per_window": [len(w) for w in windows],
+             "comparisons_made": len(pairwise_scores),
+             "trend_r_squared": trend_r_squared,
+             "n_words_used": n_words,
+             "method": "kilgarriff_drift_2001",
+         },
+     )
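Taken together, the drift.py additions give the new consistency package a single public entry point. The sketch below is editorial commentary, not part of the diff: it assumes `compute_kilgarriff_drift` is re-exported from `pystylometry.consistency` (its `__init__.py` is added in this release, per the file list) and reads only fields documented in the docstring above.

    # Hedged usage sketch; the import path is an assumption based on the package layout.
    from pystylometry.consistency import compute_kilgarriff_drift

    with open("manuscript.txt", encoding="utf-8") as fh:  # any long text
        text = fh.read()

    # Defaults shown explicitly: 1000-word windows advanced by 500 words,
    # i.e. the 50% overlap the module docstring recommends for a smooth curve.
    result = compute_kilgarriff_drift(text, window_size=1000, stride=500)

    if result.status == "insufficient_data":
        print(result.status_message)
    else:
        print(f"pattern={result.pattern} (confidence {result.pattern_confidence:.2f})")
        print(f"mean chi-squared={result.mean_chi_squared:.1f}, trend={result.trend:.3f}")
        if result.pattern == "sudden_spike":
            # max_location indexes the pairwise comparison with the largest chi-squared
            i, j = result.pairwise_scores[result.max_location]["chunk_pair"]
            print(f"largest discontinuity between windows {i} and {j}")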
pystylometry/dialect/README.md (new file)
@@ -0,0 +1,26 @@
+ # dialect
+
+ ![1 public function](https://img.shields.io/badge/functions-1-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Regional dialect detection (British vs. American English) with markedness scoring.
+
+ ## Catalogue
+
+ | File | Function | What It Does |
+ |------|----------|-------------|
+ | `detector.py` | `compute_dialect` | Classifies text dialect, computes British/American scores, markedness |
+ | `_loader.py` | `get_markers`, `DialectMarkers` | Loads and caches extensible JSON marker database |
+ | `_data/dialect_markers.json` | _(data)_ | Vocabulary, spelling, grammar, and eye-dialect markers |
+
+ ## Detection Categories
+
+ - **Vocabulary** -- flat/apartment, lorry/truck, boot/trunk
+ - **Spelling** -- colour/color, organise/organize, centre/center
+ - **Grammar** -- collective noun agreement, "have got" patterns
+ - **Eye dialect** -- gonna, wanna (register markers, not true dialect)
+
+ ## See Also
+
+ - [`stylistic/`](../stylistic/) for broader style marker analysis
+ - [`stylistic/genre_register.py`](../stylistic/) for formality and register classification
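The catalogue above pairs one public scoring function with a cached marker database. A minimal sketch of the documented quick-start path (output values are illustrative; only fields shown in the `dialect/__init__.py` docstring below are read):

    from pystylometry.dialect import compute_dialect

    result = compute_dialect("The colour of the programme was brilliant.")
    print(result.dialect)           # e.g. 'british'
    print(result.british_score)     # e.g. 0.85
    print(result.markedness_score)  # how far the text sits from unmarked standard English
    print(result.spelling_markers)  # e.g. {'colour': 1, 'programme': 1}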
pystylometry/dialect/__init__.py (new file)
@@ -0,0 +1,65 @@
+ """Dialect detection module for stylometric analysis.
+
+ This module provides dialect detection capabilities, identifying regional
+ linguistic preferences (British vs. American English) and measuring text
+ markedness - how far the text deviates from "unmarked" standard English.
+
+ Related GitHub Issues:
+     #35 - Dialect detection with extensible JSON markers
+         https://github.com/craigtrim/pystylometry/issues/35
+     #30 - Whonix stylometry features (regional linguistic preferences)
+         https://github.com/craigtrim/pystylometry/issues/30
+     #27 - Native chunked analysis with Distribution dataclass
+         https://github.com/craigtrim/pystylometry/issues/27
+
+ Features:
+     - British/American vocabulary matching (flat/apartment, lorry/truck)
+     - Spelling pattern detection (-ise/-ize, -our/-or, -re/-er)
+     - Grammar pattern analysis (have got/have, collective noun agreement)
+     - Eye dialect identification (gonna, wanna - register, not dialect)
+     - Feature weighting based on linguistic research
+     - Markedness scoring for stylistic analysis
+     - Native chunked analysis with distribution statistics
+
+ The analysis uses an extensible JSON database (dialect_markers.json) that
+ can be augmented with additional markers over time.
+
+ Usage:
+     >>> from pystylometry.dialect import compute_dialect
+     >>> result = compute_dialect("The colour of the programme was brilliant.")
+     >>> result.dialect
+     'british'
+     >>> result.british_score
+     0.85
+     >>> result.markedness_score
+     0.42
+
+     >>> # Access distributions for stylometric fingerprinting
+     >>> result.british_score_dist.std  # Standard deviation across chunks
+     0.05
+
+     >>> # Inspect detailed marker breakdown
+     >>> result.spelling_markers
+     {'colour': 1, 'programme': 1}
+     >>> result.markers_by_level['phonological']
+     {'colour': 1}
+
+ References:
+     Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+     numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+     Österreichischen Akademie der Wissenschaften, 1982.
+     Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+     Compass, vol. 3, no. 1, 2009, pp. 175-198.
+     Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+     https://www.whonix.org/wiki/Stylometry
+ """
+
+ from ._loader import DialectMarkers, clear_cache, get_markers
+ from .detector import compute_dialect
+
+ __all__ = [
+     "compute_dialect",
+     "get_markers",
+     "clear_cache",
+     "DialectMarkers",
+ ]
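Because the markers live in an extensible JSON file rather than in code, augmenting the database is a data change. A hedged sketch of the reload cycle implied by the exported helpers; the exact caching behaviour of `get_markers` and `clear_cache` is an assumption, since this diff does not reproduce `_loader.py`:

    from pystylometry.dialect import clear_cache, get_markers

    markers = get_markers()   # assumed to load and cache _data/dialect_markers.json
    # ... add new vocabulary/spelling/grammar markers to the JSON file on disk ...
    clear_cache()             # assumed to drop the cached DialectMarkers instance
    markers = get_markers()   # next call re-reads the augmented database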