pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +29 -3
- pystylometry/_types.py +963 -259
- pystylometry/authorship/__init__.py +23 -2
- pystylometry/authorship/additional_methods.py +4 -29
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/consistency/drift.py (new file)
@@ -0,0 +1,549 @@
+"""Kilgarriff chi-squared drift detection for intra-document stylistic analysis.
+
+This module implements drift detection within a single document by applying
+Kilgarriff's chi-squared method to sequential chunks of text. It enables
+detection of stylistic inconsistencies, AI-generated content signatures,
+multi-author documents, and pasted/edited content.
+
+Related GitHub Issues:
+    #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+        https://github.com/craigtrim/pystylometry/issues/36
+    #31 - Classical Stylometric Methods from Programming Historian
+        https://github.com/craigtrim/pystylometry/issues/31
+    #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+Core Concept:
+    By comparing sequential chunks of a single document, we can measure
+    **internal stylistic consistency**. Human writing typically shows natural
+    variation in chi-squared scores between chunks. AI-generated text often
+    shows either suspicious uniformity or periodic resets.
+
+Pattern Signatures:
+    The function classifies detected patterns into named categories:
+
+    - consistent: Low, stable χ² across pairs
+        → Natural human writing with normal variation
+        → Well-edited, single-author text
+
+    - gradual_drift: Slowly increasing χ² trend
+        → Author fatigue (style degrades over time)
+        → Topic evolution affecting vocabulary
+        → Editing that becomes progressively heavier
+
+    - sudden_spike: One or more pairs have abnormally high χ²
+        → Pasted content from different source
+        → Different author wrote a section
+        → Heavy editing in one region
+
+    - suspiciously_uniform: Near-zero variance in χ² scores
+        → Possible AI generation (too consistent)
+        → Text generated in single session without revision
+        → Copy-pasted repetitive content
+
+    - unknown: Insufficient data for classification
+        → Text too short (fewer than MIN_WINDOWS chunks)
+
+Sliding Window Support:
+    The function supports overlapping windows via the `stride` parameter:
+    - stride == window_size: Non-overlapping chunks (original behavior)
+    - stride < window_size: Overlapping windows (smoother drift curve)
+    - stride > window_size: Gaps between windows (sparse sampling)
+
+    50% overlap (stride = window_size / 2) is recommended for smooth detection.
+
+Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+References:
+    Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+    Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+    doi: 10.1075/ijcl.6.1.05kil
+
+    Eder, Maciej. "Does Size Matter? Authorship Attribution, Small Samples,
+    Big Problem." Digital Scholarship in the Humanities, vol. 30, no. 2,
+    2015, pp. 167-182.
+
+    Juola, Patrick. "Authorship Attribution." Foundations and Trends in
+    Information Retrieval, vol. 1, no. 3, 2006, pp. 233-334.
+"""
+
+from __future__ import annotations
+
+import statistics
+from typing import Any
+
+from .._types import KilgarriffDriftResult
+from .._utils import tokenize
+from ..authorship.kilgarriff import _kilgarriff_core
+from ._thresholds import (
+    CONFIDENCE_MIN_WINDOWS,
+    MARGINAL_DATA_MAX_CONFIDENCE,
+    MIN_WINDOWS,
+    RECOMMENDED_WINDOWS,
+    SPIKE_MIN_ABSOLUTE,
+    SPIKE_RATIO,
+    TREND_R_SQUARED_THRESHOLD,
+    TREND_SLOPE_THRESHOLD,
+    UNIFORM_CV_THRESHOLD,
+    UNIFORM_MEAN_THRESHOLD,
+    get_all_thresholds,
+)
+
+
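`_kilgarriff_core` is imported from pystylometry/authorship/kilgarriff.py (+347 in the file list above), whose body is not part of this hunk. As background, here is a minimal sketch of Kilgarriff's published chi-squared comparison, assuming nonempty token lists; this is not the package's actual implementation, which also returns degrees of freedom and per-word contributions:

```python
from collections import Counter

def chi_squared_sketch(tokens_a: list[str], tokens_b: list[str], n_words: int = 500) -> float:
    """Kilgarriff-style chi-squared over the top-n words of the joint corpus."""
    counts_a, counts_b = Counter(tokens_a), Counter(tokens_b)
    joint = counts_a + counts_b
    size_a, size_b = len(tokens_a), len(tokens_b)
    total = size_a + size_b
    chi_sq = 0.0
    for word, joint_count in joint.most_common(n_words):
        # Expected counts split the joint frequency in proportion to corpus size.
        expected_a = joint_count * size_a / total
        expected_b = joint_count * size_b / total
        chi_sq += (counts_a[word] - expected_a) ** 2 / expected_a
        chi_sq += (counts_b[word] - expected_b) ** 2 / expected_b
    return chi_sq
```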
+def _create_sliding_windows(
+    tokens: list[str],
+    window_size: int,
+    stride: int,
+) -> list[list[str]]:
+    """
+    Create sliding windows over a token list.
+
+    This function implements the sliding window mechanism for drift detection.
+    Windows can overlap (stride < window_size), be non-overlapping
+    (stride == window_size), or have gaps (stride > window_size).
+
+    Related GitHub Issue:
+        #36 - Sliding window support for drift detection
+        https://github.com/craigtrim/pystylometry/issues/36
+
+    Args:
+        tokens: List of tokens to window over
+        window_size: Number of tokens per window
+        stride: Number of tokens to advance between windows
+
+    Returns:
+        List of token lists, each representing one window
+
+    Example:
+        >>> tokens = ["a", "b", "c", "d", "e", "f", "g", "h"]
+        >>> windows = _create_sliding_windows(tokens, window_size=4, stride=2)
+        >>> # windows[0] = ["a", "b", "c", "d"]
+        >>> # windows[1] = ["c", "d", "e", "f"]
+        >>> # windows[2] = ["e", "f", "g", "h"]
+        >>> # windows[3] = ["g", "h"]  (partial window kept: 2 >= window_size * 0.5)
+    """
+    if stride <= 0:
+        raise ValueError(f"stride must be positive, got {stride}")
+    if window_size <= 0:
+        raise ValueError(f"window_size must be positive, got {window_size}")
+
+    windows = []
+    start = 0
+
+    while start + window_size <= len(tokens):
+        window = tokens[start : start + window_size]
+        windows.append(window)
+        start += stride
+
+    # Handle final partial window if text doesn't divide evenly
+    # Only include if it has at least 50% of window_size tokens
+    if start < len(tokens):
+        final_window = tokens[start:]
+        if len(final_window) >= window_size * 0.5:
+            windows.append(final_window)
+
+    return windows
+
+
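A quick check of the windowing arithmetic, assuming the function exactly as added above: full windows start at every stride-th offset while `start + window_size <= len(tokens)`, so a 2,500-token text with the module defaults yields four full windows plus one retained partial:

```python
tokens = [f"w{i}" for i in range(2500)]
windows = _create_sliding_windows(tokens, window_size=1000, stride=500)
# Full windows start at offsets 0, 500, 1000, 1500; the 500-token remainder
# at offset 2000 is kept because 500 >= 1000 * 0.5.
print([len(w) for w in windows])  # [1000, 1000, 1000, 1000, 500]
```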
+def _compute_trend(values: list[float]) -> tuple[float, float]:
+    """
+    Compute linear regression slope and R-squared for trend detection.
+
+    Uses simple linear regression to detect whether chi-squared values
+    are increasing or decreasing over the document.
+
+    Related GitHub Issue:
+        #36 - Gradual drift pattern detection
+        https://github.com/craigtrim/pystylometry/issues/36
+
+    Args:
+        values: List of chi-squared values in order
+
+    Returns:
+        Tuple of (slope, r_squared)
+        - slope: Chi-squared units per comparison (positive = increasing)
+        - r_squared: Coefficient of determination (0-1, higher = better fit)
+
+    Example:
+        >>> values = [10.0, 15.0, 20.0, 25.0, 30.0]  # Linear increase
+        >>> slope, r_sq = _compute_trend(values)
+        >>> # slope ≈ 5.0, r_sq ≈ 1.0
+    """
+    if len(values) < 2:
+        return 0.0, 0.0
+
+    n = len(values)
+    x = list(range(n))  # 0, 1, 2, ...
+
+    # Means
+    mean_x = sum(x) / n
+    mean_y = sum(values) / n
+
+    # Sums of squared deviations (unnormalized covariance and variance)
+    cov_xy = sum((xi - mean_x) * (yi - mean_y) for xi, yi in zip(x, values))
+    var_x = sum((xi - mean_x) ** 2 for xi in x)
+    var_y = sum((yi - mean_y) ** 2 for yi in values)
+
+    # Slope
+    if var_x == 0:
+        return 0.0, 0.0
+    slope = cov_xy / var_x
+
+    # R-squared
+    if var_y == 0:
+        r_squared = 1.0 if slope == 0 else 0.0
+    else:
+        # R² = 1 - (residual sum of squares) / (total sum of squares)
+        ss_res = sum((yi - (mean_y + slope * (xi - mean_x))) ** 2 for xi, yi in zip(x, values))
+        ss_tot = var_y  # var_y is already the total sum of squares; do not rescale by n
+        r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0.0
+
+    return slope, max(0.0, min(1.0, r_squared))
+
+
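Two quick numeric checks of `_compute_trend`, worked by hand from the formulas above: the docstring's perfectly linear series, and a noisy alternating series where the fitted line explains almost none of the variance:

```python
print(_compute_trend([10.0, 15.0, 20.0, 25.0, 30.0]))  # (5.0, 1.0): perfect linear fit
# Alternating values: slope 0.8 but R^2 ≈ 0.02, so no gradual_drift signal.
print(_compute_trend([10.0, 30.0, 12.0, 28.0, 15.0]))
```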
+def _classify_pattern(
+    mean_chi: float,
+    std_chi: float,
+    max_chi: float,
+    min_chi: float,
+    trend_slope: float,
+    trend_r_squared: float,
+    window_count: int,
+) -> tuple[str, float]:
+    """
+    Classify the detected pattern and compute confidence score.
+
+    This function implements the pattern classification logic described in
+    _thresholds.py. It uses a decision tree approach, checking for each
+    pattern in order of specificity.
+
+    Related GitHub Issue:
+        #36 - Pattern classification for drift detection
+        https://github.com/craigtrim/pystylometry/issues/36
+
+    Decision Order:
+        1. Suspiciously uniform (near-zero variance)
+        2. Sudden spike (outlier max value)
+        3. Gradual drift (significant trend)
+        4. Consistent (default)
+
+    Args:
+        mean_chi: Mean chi-squared across comparisons
+        std_chi: Standard deviation of chi-squared values
+        max_chi: Maximum chi-squared value
+        min_chi: Minimum chi-squared value
+        trend_slope: Linear regression slope
+        trend_r_squared: R-squared of trend fit
+        window_count: Number of windows analyzed
+
+    Returns:
+        Tuple of (pattern_name, confidence)
+        - pattern_name: One of "consistent", "gradual_drift", "sudden_spike",
+          "suspiciously_uniform", "unknown"
+        - confidence: 0.0-1.0 confidence in the classification
+
+    Note:
+        Confidence is scaled down for marginal data (few windows).
+    """
+    # Base confidence scales with window count
+    if window_count < MIN_WINDOWS:
+        return "unknown", 0.0
+
+    base_confidence = min(1.0, window_count / CONFIDENCE_MIN_WINDOWS)
+    if window_count < RECOMMENDED_WINDOWS:
+        base_confidence = min(base_confidence, MARGINAL_DATA_MAX_CONFIDENCE)
+
+    # Handle edge case of zero mean
+    if mean_chi == 0:
+        return "consistent", base_confidence
+
+    # Coefficient of variation
+    cv = std_chi / mean_chi if mean_chi > 0 else 0.0
+
+    # 1. Check for suspiciously uniform (AI signature)
+    # Very low variance with low mean suggests artificial consistency
+    if cv < UNIFORM_CV_THRESHOLD and mean_chi < UNIFORM_MEAN_THRESHOLD:
+        # Confidence increases as CV decreases
+        uniformity_strength = 1 - (cv / UNIFORM_CV_THRESHOLD)
+        confidence = base_confidence * (0.6 + 0.4 * uniformity_strength)
+        return "suspiciously_uniform", confidence
+
+    # 2. Check for sudden spike (discontinuity)
+    # Max significantly exceeds mean, indicating an outlier
+    spike_ratio = max_chi / mean_chi if mean_chi > 0 else 0.0
+    if spike_ratio > SPIKE_RATIO and max_chi > SPIKE_MIN_ABSOLUTE:
+        # Confidence based on how extreme the spike is
+        spike_strength = min(1.0, (spike_ratio - SPIKE_RATIO) / SPIKE_RATIO)
+        confidence = base_confidence * (0.7 + 0.3 * spike_strength)
+        return "sudden_spike", confidence
+
+    # 3. Check for gradual drift (trend)
+    # Significant slope with reasonable R-squared
+    if abs(trend_slope) > TREND_SLOPE_THRESHOLD and trend_r_squared > TREND_R_SQUARED_THRESHOLD:
+        # Confidence based on R-squared (how well trend explains variance)
+        confidence = base_confidence * (0.5 + 0.5 * trend_r_squared)
+        return "gradual_drift", confidence
+
+    # 4. Default: consistent (natural human variation)
+    # Moderate variance, no extreme patterns
+    confidence = base_confidence * 0.8  # Slightly lower confidence for "normal"
+    return "consistent", confidence
+
+
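To make the decision tree concrete, here is how the intermediate quantities fall out for a series with one outlying pair. The actual cutoffs (SPIKE_RATIO, UNIFORM_CV_THRESHOLD, and the rest) live in consistency/_thresholds.py and their values are not shown in this diff, so the classification noted below is illustrative only:

```python
import statistics

chi_values = [12.0, 11.5, 12.3, 48.0, 11.9]   # one pair stands out
mean_chi = statistics.mean(chi_values)         # 19.14
cv = statistics.stdev(chi_values) / mean_chi   # ≈ 0.84: far from uniform
spike_ratio = max(chi_values) / mean_chi       # ≈ 2.51
# If spike_ratio exceeds SPIKE_RATIO and max(chi_values) exceeds
# SPIKE_MIN_ABSOLUTE, this series classifies as "sudden_spike".
```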
+def compute_kilgarriff_drift(
+    text: str,
+    window_size: int = 1000,
+    stride: int = 500,
+    comparison_mode: str = "sequential",
+    lag: int = 1,
+    n_words: int = 500,
+) -> KilgarriffDriftResult:
+    """
+    Detect stylistic drift within a single document using Kilgarriff's chi-squared.
+
+    This function chunks a single text and computes chi-squared distances between
+    sequential (or all) chunk pairs to measure internal stylistic consistency.
+    It classifies the detected pattern and returns detailed metrics for analysis.
+
+    Related GitHub Issues:
+        #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
+        https://github.com/craigtrim/pystylometry/issues/36
+        #31 - Classical Stylometric Methods from Programming Historian
+        https://github.com/craigtrim/pystylometry/issues/31
+
+    Marketing Name: "Style Drift Detector" / "Consistency Fingerprint"
+
+    Pattern Signatures:
+        - consistent: Low, stable χ² across pairs (natural human writing)
+        - gradual_drift: Slowly increasing trend (author fatigue, topic shift)
+        - sudden_spike: One pair has high χ² (pasted content, different author)
+        - suspiciously_uniform: Near-zero variance (possible AI generation)
+
+    Algorithm:
+        1. Tokenize text and create sliding windows
+        2. For each window pair (based on comparison_mode):
+           a. Compute chi-squared using Kilgarriff's method
+           b. Record contributing words
+        3. Compute holistic metrics (mean, std, max, trend)
+        4. Classify pattern based on thresholds
+        5. Return detailed result with all metrics
+
+    References:
+        Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
+        Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
+
+    Args:
+        text: Input text to analyze for stylistic drift
+        window_size: Number of words per analysis window (default: 1000).
+            Larger windows provide more stable chi-squared but fewer comparisons.
+        stride: Number of words to advance between windows (default: 500).
+            - stride == window_size: Non-overlapping chunks
+            - stride < window_size: Overlapping windows (smoother detection)
+            - stride > window_size: Gaps between windows (sparse sampling)
+        comparison_mode: How to compare windows (default: "sequential")
+            - "sequential": Compare adjacent windows only (1-2, 2-3, 3-4)
+            - "all_pairs": Compare every window pair (produces distance matrix)
+            - "fixed_lag": Compare windows at a fixed distance (e.g., 1-3, 2-4)
+        lag: Window distance for fixed_lag mode (default: 1, ignored otherwise)
+        n_words: Top N most frequent words for chi-squared (default: 500)
+
+    Returns:
+        KilgarriffDriftResult containing:
+        - status: "success", "marginal_data", or "insufficient_data"
+        - pattern: Classified pattern name
+        - pattern_confidence: 0.0-1.0 confidence score
+        - mean_chi_squared, std_chi_squared, max_chi_squared: Statistics
+        - trend: Slope of chi-squared over document
+        - pairwise_scores: Detailed per-pair data
+        - And more (see KilgarriffDriftResult docstring)
+
+    Raises:
+        ValueError: If stride or window_size is not positive, or if
+            comparison_mode is not one of the recognized modes
+
+    Example:
+        >>> # Basic usage
+        >>> result = compute_kilgarriff_drift(long_text)
+        >>> print(f"Pattern: {result.pattern}")
+        >>> print(f"Confidence: {result.pattern_confidence:.2f}")
+
+        >>> # Custom sliding window
+        >>> result = compute_kilgarriff_drift(
+        ...     text,
+        ...     window_size=2000,  # Larger windows
+        ...     stride=1000,  # 50% overlap
+        ... )
+
+        >>> # Check for AI-generated content
+        >>> if result.pattern == "suspiciously_uniform":
+        ...     print("Warning: Text may be AI-generated")
+
+        >>> # Handle insufficient data gracefully
+        >>> if result.status == "insufficient_data":
+        ...     print(result.status_message)
+    """
+    # Input validation
+    if stride <= 0:
+        raise ValueError(f"stride must be positive, got {stride}")
+    if window_size <= 0:
+        raise ValueError(f"window_size must be positive, got {window_size}")
+    valid_modes = ("sequential", "all_pairs", "fixed_lag")
+    if comparison_mode not in valid_modes:
+        raise ValueError(f"comparison_mode must be one of {valid_modes}, got '{comparison_mode}'")
+
+    # Tokenize text
+    tokens = [t.lower() for t in tokenize(text) if t.isalpha()]
+
+    # Create sliding windows
+    windows = _create_sliding_windows(tokens, window_size, stride)
+    window_count = len(windows)
+
+    # Compute overlap ratio for metadata
+    overlap_ratio = max(0.0, 1 - (stride / window_size))
+
+    # Get thresholds for transparency
+    thresholds = get_all_thresholds()
+
+    # Check for insufficient data
+    if window_count < MIN_WINDOWS:
+        return KilgarriffDriftResult(
+            status="insufficient_data",
+            status_message=(
+                f"Text produced {window_count} windows; minimum {MIN_WINDOWS} required. "
+                f"Need approximately {window_size + (MIN_WINDOWS - 1) * stride} words."
+            ),
+            pattern="unknown",
+            pattern_confidence=0.0,
+            mean_chi_squared=float("nan"),
+            std_chi_squared=float("nan"),
+            max_chi_squared=float("nan"),
+            min_chi_squared=float("nan"),
+            max_location=-1,
+            trend=float("nan"),
+            pairwise_scores=[],
+            window_size=window_size,
+            stride=stride,
+            overlap_ratio=overlap_ratio,
+            comparison_mode=comparison_mode,
+            window_count=window_count,
+            distance_matrix=None,
+            thresholds=thresholds,
+            metadata={
+                "total_tokens": len(tokens),
+                "tokens_per_window": [len(w) for w in windows],
+            },
+        )
+
+    # Determine which pairs to compare based on mode
+    pairs_to_compare: list[tuple[int, int]] = []
+
+    if comparison_mode == "sequential":
+        # Compare adjacent windows: (0,1), (1,2), (2,3), ...
+        pairs_to_compare = [(i, i + 1) for i in range(window_count - 1)]
+
+    elif comparison_mode == "all_pairs":
+        # Compare all window pairs: (0,1), (0,2), ..., (n-2,n-1)
+        pairs_to_compare = [(i, j) for i in range(window_count) for j in range(i + 1, window_count)]
+
+    elif comparison_mode == "fixed_lag":
+        # Compare windows at fixed lag distance: (0,lag), (1,lag+1), ...
+        pairs_to_compare = [(i, i + lag) for i in range(window_count - lag)]
+
+    # Compute chi-squared for each pair
+    pairwise_scores: list[dict[str, Any]] = []
+    chi_squared_values: list[float] = []
+
+    for i, j in pairs_to_compare:
+        chi_sq, df, contributions, details = _kilgarriff_core(
+            windows[i], windows[j], n_words=n_words
+        )
+
+        pairwise_scores.append(
+            {
+                "chunk_pair": (i, j),
+                "chi_squared": chi_sq,
+                "degrees_of_freedom": df,
+                "top_words": contributions[:10],  # Top 10 contributing words
+                "window_i_size": len(windows[i]),
+                "window_j_size": len(windows[j]),
+            }
+        )
+        chi_squared_values.append(chi_sq)
+
+    # Build distance matrix for all_pairs mode
+    distance_matrix: list[list[float]] | None = None
+    if comparison_mode == "all_pairs":
+        distance_matrix = [[0.0] * window_count for _ in range(window_count)]
+        for score in pairwise_scores:
+            i, j = score["chunk_pair"]
+            distance_matrix[i][j] = score["chi_squared"]
+            distance_matrix[j][i] = score["chi_squared"]  # Symmetric
+
+    # Compute holistic statistics
+    if chi_squared_values:
+        mean_chi = statistics.mean(chi_squared_values)
+        std_chi = statistics.stdev(chi_squared_values) if len(chi_squared_values) > 1 else 0.0
+        max_chi = max(chi_squared_values)
+        min_chi = min(chi_squared_values)
+        max_location = chi_squared_values.index(max_chi)
+    else:
+        mean_chi = std_chi = max_chi = min_chi = float("nan")
+        max_location = -1
+
+    # Compute trend (only meaningful for sequential comparisons)
+    if comparison_mode == "sequential" and len(chi_squared_values) >= 2:
+        trend_slope, trend_r_squared = _compute_trend(chi_squared_values)
+    else:
+        trend_slope = 0.0
+        trend_r_squared = 0.0
+
+    # Classify pattern
+    pattern, pattern_confidence = _classify_pattern(
+        mean_chi=mean_chi,
+        std_chi=std_chi,
+        max_chi=max_chi,
+        min_chi=min_chi,
+        trend_slope=trend_slope,
+        trend_r_squared=trend_r_squared,
+        window_count=window_count,
+    )
+
+    # Determine status
+    if window_count >= RECOMMENDED_WINDOWS:
+        status = "success"
+        status_message = f"Analyzed {window_count} windows with {len(pairwise_scores)} comparisons."
+    else:
+        status = "marginal_data"
+        status_message = (
+            f"Analyzed {window_count} windows; {RECOMMENDED_WINDOWS}+ recommended "
+            "for reliable pattern classification."
+        )
+
+    return KilgarriffDriftResult(
+        status=status,
+        status_message=status_message,
+        pattern=pattern,
+        pattern_confidence=pattern_confidence,
+        mean_chi_squared=mean_chi,
+        std_chi_squared=std_chi,
+        max_chi_squared=max_chi,
+        min_chi_squared=min_chi,
+        max_location=max_location,
+        trend=trend_slope,
+        pairwise_scores=pairwise_scores,
+        window_size=window_size,
+        stride=stride,
+        overlap_ratio=overlap_ratio,
+        comparison_mode=comparison_mode,
+        window_count=window_count,
+        distance_matrix=distance_matrix,
+        thresholds=thresholds,
+        metadata={
+            "total_tokens": len(tokens),
+            "tokens_per_window": [len(w) for w in windows],
+            "comparisons_made": len(pairwise_scores),
+            "trend_r_squared": trend_r_squared,
+            "n_words_used": n_words,
+            "method": "kilgarriff_drift_2001",
+        },
+    )
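The docstring mentions that "all_pairs" mode produces a distance matrix but does not demonstrate it. A usage sketch, assuming compute_kilgarriff_drift is re-exported from pystylometry.consistency (that package's __init__.py is +57 in the file list but not shown in this diff):

```python
from pathlib import Path

from pystylometry.consistency import compute_kilgarriff_drift

text = Path("essay.txt").read_text()
result = compute_kilgarriff_drift(text, comparison_mode="all_pairs")
if result.distance_matrix is not None:
    # Row i holds window i's chi-squared distance to every other window;
    # the matrix is symmetric and the diagonal stays 0.0.
    for i, row in enumerate(result.distance_matrix):
        print(i, [round(v, 1) for v in row])
```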
pystylometry/dialect/__init__.py (new file)
@@ -0,0 +1,65 @@
+"""Dialect detection module for stylometric analysis.
+
+This module provides dialect detection capabilities, identifying regional
+linguistic preferences (British vs. American English) and measuring text
+markedness - how far the text deviates from "unmarked" standard English.
+
+Related GitHub Issues:
+    #35 - Dialect detection with extensible JSON markers
+        https://github.com/craigtrim/pystylometry/issues/35
+    #30 - Whonix stylometry features (regional linguistic preferences)
+        https://github.com/craigtrim/pystylometry/issues/30
+    #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
+Features:
+    - British/American vocabulary matching (flat/apartment, lorry/truck)
+    - Spelling pattern detection (-ise/-ize, -our/-or, -re/-er)
+    - Grammar pattern analysis (have got/have, collective noun agreement)
+    - Eye dialect identification (gonna, wanna - register, not dialect)
+    - Feature weighting based on linguistic research
+    - Markedness scoring for stylistic analysis
+    - Native chunked analysis with distribution statistics
+
+The analysis uses an extensible JSON database (dialect_markers.json) that
+can be augmented with additional markers over time.
+
+Usage:
+    >>> from pystylometry.dialect import compute_dialect
+    >>> result = compute_dialect("The colour of the programme was brilliant.")
+    >>> result.dialect
+    'british'
+    >>> result.british_score
+    0.85
+    >>> result.markedness_score
+    0.42
+
+    >>> # Access distributions for stylometric fingerprinting
+    >>> result.british_score_dist.std  # Std dev across chunks
+    0.05
+
+    >>> # Inspect detailed marker breakdown
+    >>> result.spelling_markers
+    {'colour': 1, 'programme': 1}
+    >>> result.markers_by_level['phonological']
+    {'colour': 1}
+
+References:
+    Goebl, Hans. "Dialektometrie: Prinzipien und Methoden des Einsatzes der
+    numerischen Taxonomie im Bereich der Dialektgeographie." Verlag der
+    Österreichischen Akademie der Wissenschaften, 1982.
+    Nerbonne, John. "Data-Driven Dialectology." Language and Linguistics
+    Compass, vol. 3, no. 1, 2009, pp. 175-198.
+    Whonix Project. "Stylometry: Deanonymization Techniques." Whonix Wiki,
+    https://www.whonix.org/wiki/Stylometry
+"""
+
+from ._loader import DialectMarkers, clear_cache, get_markers
+from .detector import compute_dialect
+
+__all__ = [
+    "compute_dialect",
+    "get_markers",
+    "clear_cache",
+    "DialectMarkers",
+]
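A usage sketch for the dialect module, restricted to the attributes the docstring above documents (result.dialect, result.british_score, result.markedness_score, result.spelling_markers, result.markers_by_level); the behavior of get_markers() and clear_cache() is inferred from their names and the docstring's mention of a cached dialect_markers.json database:

```python
from pystylometry.dialect import compute_dialect, get_markers

result = compute_dialect("The colour of the programme was brilliant.")
print(result.dialect, result.british_score, result.markedness_score)

# Marker breakdowns, per the docstring's examples.
print(result.spelling_markers)
for level, markers in result.markers_by_level.items():
    print(level, markers)

# get_markers() presumably exposes the cached marker database;
# clear_cache() would reset it so edits to the JSON are picked up.
markers = get_markers()
```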