pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
|
@@ -0,0 +1,589 @@
|
|
|
1
|
+
"""Drift detection visualizations (matplotlib).
|
|
2
|
+
|
|
3
|
+
This module provides matplotlib-based visualizations for Kilgarriff chi-squared
|
|
4
|
+
drift detection results. For interactive HTML exports, see pystylometry.viz.jsx.
|
|
5
|
+
|
|
6
|
+
Related GitHub Issues:
|
|
7
|
+
#36 - Kilgarriff Chi-Squared drift detection
|
|
8
|
+
#38 - Visualization Options for Style Drift Detection
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
from typing import TYPE_CHECKING, TypedDict
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from .._types import KilgarriffDriftResult
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
class _ScatterDataPoint(TypedDict):
|
|
21
|
+
"""Type for scatter plot data points."""
|
|
22
|
+
|
|
23
|
+
label: str
|
|
24
|
+
mean_chi: float
|
|
25
|
+
cv: float
|
|
26
|
+
pattern: str
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
# Reference bounds for zone classification (empirically derived).

# Mean chi-squared below this value is treated as an AI-like baseline.
MEAN_CHI_LOW: int = 100
# Mean chi-squared above this value is treated as a human-like baseline.
MEAN_CHI_HIGH: int = 250
# Coefficient of variation below this value counts as very stable.
CV_LOW: float = 0.08
# Coefficient of variation above this value counts as volatile
# (potential discontinuity).
CV_HIGH: float = 0.20
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def plot_drift_timeline(
    result: "KilgarriffDriftResult",
    output: str | Path | None = None,
    title: str | None = None,
    figsize: tuple[float, float] = (12, 6),
    show_spike_threshold: bool = True,
    show_ai_threshold: bool = True,
) -> None:
    """
    Draw the pairwise chi-squared scores of a drift result as a timeline.

    The chart has the window pair index on the x-axis and the chi-squared
    score on the y-axis. The pair at ``result.max_location`` is highlighted,
    the mean is drawn as a solid line, and optional dotted reference lines
    mark the AI baseline (y=50) and the spike threshold (mean + 2 * std).

    Args:
        result: KilgarriffDriftResult from compute_kilgarriff_drift()
        output: Where to save the figure. When falsy (default ``None``) the
            figure is shown with ``plt.show()`` instead of being saved.
        title: Chart title. When ``None`` a title is built from
            ``result.pattern``.
        figsize: Figure size in inches (width, height)
        show_spike_threshold: Draw the mean + 2 * std line. The line is only
            drawn when the mean chi-squared is positive.
        show_ai_threshold: Draw the AI baseline line at y=50

    Example:
        >>> result = compute_kilgarriff_drift(text)
        >>> plot_drift_timeline(result, output="timeline.png")
    """
    from . import _check_viz_available

    _check_viz_available()

    import matplotlib.pyplot as plt
    import seaborn as sns  # type: ignore[import-untyped]

    # One score per window pair; the x position is simply the pair index.
    scores = [pair["chi_squared"] for pair in result.pairwise_scores]
    positions = list(range(len(scores)))

    sns.set_theme(style="whitegrid", palette="muted")
    _, ax = plt.subplots(figsize=figsize)

    # Score curve with a translucent fill underneath it.
    curve_colour = "#2563eb"
    ax.plot(
        positions,
        scores,
        linewidth=2,
        color=curve_colour,
        marker="o",
        markersize=4,
        alpha=0.8,
    )
    ax.fill_between(positions, scores, alpha=0.2, color=curve_colour)

    # Highlight the maximum, but only when its index addresses a plotted score.
    peak = result.max_location
    if peak is not None and peak < len(scores):
        peak_colour = "#dc2626"
        ax.axvline(
            x=peak,
            color=peak_colour,
            linestyle="--",
            linewidth=2,
            alpha=0.7,
            label=f"Max χ² at window {peak}",
        )
        ax.scatter(
            [peak],
            [scores[peak]],
            color=peak_colour,
            s=150,
            zorder=5,
            edgecolors="white",
            linewidth=2,
        )

    # Both optional thresholds share the same dotted look.
    dotted = {"linestyle": ":", "linewidth": 1.5, "alpha": 0.7}
    mean_chi = result.mean_chi_squared

    if show_ai_threshold:
        ax.axhline(y=50, color="#f59e0b", label="AI baseline threshold (~50)", **dotted)

    if show_spike_threshold and mean_chi > 0:
        spike_threshold = mean_chi + 2 * result.std_chi_squared
        ax.axhline(
            y=spike_threshold,
            color="#10b981",
            label=f"Spike threshold (μ+2σ = {spike_threshold:.0f})",
            **dotted,
        )

    # The mean line is always drawn, so the legend is never empty.
    ax.axhline(
        y=mean_chi,
        color="#6b7280",
        linestyle="-",
        linewidth=1,
        alpha=0.5,
        label=f"Mean χ² = {mean_chi:.1f}",
    )

    ax.set_xlabel("Window Pair Index", fontsize=12)
    ax.set_ylabel("Chi-squared (χ²)", fontsize=12)

    heading = title
    if heading is None:
        readable_pattern = result.pattern.replace("_", " ").title()
        heading = f"Stylistic Drift Timeline — Pattern: {readable_pattern}"
    ax.set_title(heading, fontsize=14, fontweight="bold")

    ax.legend(loc="upper right", framealpha=0.9)

    # Small statistics box pinned to the top-left corner of the axes.
    summary = "\n".join(
        [
            f"Mean: {mean_chi:.1f}",
            f"Std: {result.std_chi_squared:.1f}",
            f"Windows: {result.window_count}",
        ]
    )
    ax.annotate(
        summary,
        xy=(0.02, 0.98),
        xycoords="axes fraction",
        verticalalignment="top",
        fontsize=10,
        family="monospace",
        bbox={"boxstyle": "round,pad=0.5", "facecolor": "white", "alpha": 0.8},
    )

    plt.tight_layout()

    if not output:
        plt.show()
        return

    plt.savefig(output, dpi=150, bbox_inches="tight")
    plt.close()
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def plot_drift_scatter(
    results: list[tuple[str, "KilgarriffDriftResult"]],
    output: str | Path | None = None,
    title: str = "Style Drift Detection — Reference Zone Plot",
    figsize: tuple[float, float] = (10, 8),
    show_zones: bool = True,
    annotate_points: bool = True,
) -> None:
    """
    Plot multiple documents on a scatter plot with reference zones.

    Creates a tic-tac-toe style visualization where:
    - X-axis: Mean chi-squared (baseline stylistic variation)
    - Y-axis: Coefficient of variation (std / mean of chi-squared)
    - Zones are shaded rectangles bounded by MEAN_CHI_LOW / MEAN_CHI_HIGH
      and CV_LOW / CV_HIGH

    The axes always cover at least x in [0, 450] and y in [0, 1.0]; they grow
    to 110% of the largest plotted value when a point would fall outside.
    Zone shading is drawn in data coordinates, so it stays aligned with the
    dashed reference lines whatever the final axis limits are. An empty
    ``results`` list produces the bare zone chart.

    Args:
        results: List of (label, KilgarriffDriftResult) tuples
        output: Where to save the figure. When falsy (default ``None``) the
            figure is shown with ``plt.show()`` instead of being saved.
        title: Chart title
        figsize: Figure size in inches
        show_zones: Show reference zone shading, boundaries and labels
        annotate_points: Label each point with its name

    Example:
        >>> results = [
        ...     ("Document A", compute_kilgarriff_drift(text_a)),
        ...     ("Document B", compute_kilgarriff_drift(text_b)),
        ... ]
        >>> plot_drift_scatter(results, output="scatter.png")
    """
    from . import _check_viz_available

    _check_viz_available()

    import matplotlib.patches as mpatches
    import matplotlib.pyplot as plt
    import seaborn as sns  # type: ignore[import-untyped]

    # Extract data
    data: list[_ScatterDataPoint] = []
    for label, result in results:
        mean_chi = result.mean_chi_squared
        # A non-positive mean would make the ratio meaningless; report 0 instead.
        cv = result.std_chi_squared / mean_chi if mean_chi > 0 else 0.0
        data.append(
            {
                "label": label,
                "mean_chi": mean_chi,
                "cv": cv,
                "pattern": result.pattern,
            }
        )

    # Set up style
    sns.set_theme(style="whitegrid")

    fig, ax = plt.subplots(figsize=figsize)

    # Fix the axis limits before drawing the zones so the zones can be sized
    # to the visible area. ``default=0.0`` keeps an empty ``results`` list
    # from raising ValueError in max().
    x_max = max(450, max((d["mean_chi"] for d in data), default=0.0) * 1.1)
    y_max = max(1.0, max((d["cv"] for d in data), default=0.0) * 1.1)
    ax.set_xlim(0, x_max)
    ax.set_ylim(0, y_max)

    if show_zones:
        zone_colors = {
            "human_normal": "#dcfce7",  # Light green
            "human_tight": "#d1fae5",  # Lighter green
            "ai_uniform": "#fee2e2",  # Light red
            "splice": "#fecaca",  # Light red-orange
            "transition": "#f3f4f6",  # Light gray
        }

        # (x0, x1, y0, y1, zone, alpha) in DATA coordinates. Rectangles are
        # used instead of axvspan because axvspan's ymin/ymax are axes
        # fractions, which only coincide with CV values while the y-axis
        # spans exactly 0..1.
        zones = [
            # Bottom-right: Human zones
            (MEAN_CHI_HIGH, x_max, 0, CV_HIGH, "human_normal", 0.3),
            (MEAN_CHI_HIGH, x_max, 0, CV_LOW, "human_tight", 0.4),
            # Bottom-left: AI zone
            (0, MEAN_CHI_LOW, 0, CV_HIGH, "ai_uniform", 0.3),
            # Top-right: Splice/volatile
            (MEAN_CHI_HIGH, x_max, CV_HIGH, y_max, "splice", 0.3),
            # Middle column: Transition
            (MEAN_CHI_LOW, MEAN_CHI_HIGH, 0, y_max, "transition", 0.2),
        ]
        for x0, x1, y0, y1, zone, zone_alpha in zones:
            ax.add_patch(
                mpatches.Rectangle(
                    (x0, y0),
                    x1 - x0,
                    y1 - y0,
                    alpha=zone_alpha,
                    color=zone_colors[zone],
                )
            )

        # Draw reference lines
        boundary_style = {"color": "#9ca3af", "linestyle": "--", "linewidth": 1.5, "alpha": 0.7}
        ax.axvline(x=MEAN_CHI_LOW, **boundary_style)
        ax.axvline(x=MEAN_CHI_HIGH, **boundary_style)
        ax.axhline(y=CV_LOW, **boundary_style)
        ax.axhline(y=CV_HIGH, **boundary_style)

        # Zone labels: (x, y, text, colour, alpha)
        zone_labels = [
            (50, 0.04, "AI-UNIFORM", "#6b7280", 0.8),
            (50, 0.5, "ANOMALOUS", "#6b7280", 0.8),
            (175, 0.14, "TRANSITION", "#6b7280", 0.8),
            (350, 0.04, "HUMAN-TIGHT", "#6b7280", 0.8),
            (350, 0.14, "HUMAN", "#059669", 0.9),
            (350, 0.5, "SPLICE", "#dc2626", 0.9),
        ]
        for x_pos, y_pos, text, text_color, text_alpha in zone_labels:
            ax.text(
                x_pos,
                y_pos,
                text,
                fontsize=9,
                ha="center",
                va="center",
                color=text_color,
                alpha=text_alpha,
            )

    # Color points by pattern
    pattern_colors = {
        "consistent": "#22c55e",  # Green
        "sudden_spike": "#ef4444",  # Red
        "gradual_drift": "#f59e0b",  # Amber
        "suspiciously_uniform": "#8b5cf6",  # Purple
        "unknown": "#6b7280",  # Gray
    }
    fallback_color = pattern_colors["unknown"]

    # Plot points
    for d in data:
        ax.scatter(
            d["mean_chi"],
            d["cv"],
            s=200,
            c=pattern_colors.get(d["pattern"], fallback_color),
            edgecolors="white",
            linewidth=2,
            zorder=5,
            alpha=0.9,
        )

        if annotate_points:
            ax.annotate(
                d["label"],
                (d["mean_chi"], d["cv"]),
                xytext=(8, 8),
                textcoords="offset points",
                fontsize=9,
                fontweight="bold",
                color="#1f2937",
            )

    # Axis labels (limits were fixed above, before the zones were drawn)
    ax.set_xlabel("Mean χ² (Baseline Stylistic Variation)", fontsize=12)
    ax.set_ylabel("CV (Coefficient of Variation)", fontsize=12)

    ax.set_title(title, fontsize=14, fontweight="bold", pad=20)

    # Legend for patterns
    legend_patterns = ("consistent", "sudden_spike", "gradual_drift", "suspiciously_uniform")
    legend_handles = [
        mpatches.Patch(color=pattern_colors[name], label=name.replace("_", " ").title())
        for name in legend_patterns
    ]
    # Points whose pattern is not one of the four above are drawn gray; give
    # them a legend entry so the gray markers are explained.
    if any(d["pattern"] not in legend_patterns for d in data):
        legend_handles.append(mpatches.Patch(color=fallback_color, label="Unknown"))
    ax.legend(handles=legend_handles, loc="upper right", title="Detected Pattern", framealpha=0.9)

    # Reference bounds annotation
    bounds_text = (
        f"Reference Bounds:\n"
        f" Mean χ² < {MEAN_CHI_LOW}: AI baseline\n"
        f" Mean χ² > {MEAN_CHI_HIGH}: Human baseline\n"
        f" CV < {CV_LOW}: Very stable\n"
        f" CV > {CV_HIGH}: Volatile"
    )
    ax.annotate(
        bounds_text,
        xy=(0.02, 0.98),
        xycoords="axes fraction",
        verticalalignment="top",
        fontsize=8,
        family="monospace",
        bbox=dict(boxstyle="round,pad=0.5", facecolor="white", alpha=0.8),
    )

    fig.tight_layout()

    if output:
        # Save and close the figure created here rather than whichever figure
        # pyplot currently considers "current".
        fig.savefig(output, dpi=150, bbox_inches="tight")
        plt.close(fig)
    else:
        plt.show()
|
|
392
|
+
|
|
393
|
+
|
|
394
|
+
def plot_drift_report(
    result: "KilgarriffDriftResult",
    label: str = "Document",
    output: str | Path | None = None,
    figsize: tuple[float, float] = (14, 10),
) -> None:
    """
    Render a multi-panel drift analysis report for a single document.

    The figure is laid out on a 3x2 grid and contains five panels:
    - Timeline of chi-squared values (full width, top row)
    - Histogram of the chi-squared distribution
    - Summary statistics text panel
    - Top contributing words at the maximum location (bar chart, or a
      placeholder message when no spike / no word data is available)
    - Zone classification text panel based on the module reference bounds

    Args:
        result: KilgarriffDriftResult from compute_kilgarriff_drift()
        label: Document label used in the figure title
        output: Where to save the figure. When falsy (default ``None``) the
            figure is shown with ``plt.show()`` instead of being saved.
        figsize: Figure size in inches

    Example:
        >>> result = compute_kilgarriff_drift(text)
        >>> plot_drift_report(result, label="My Document", output="report.png")
    """
    from . import _check_viz_available

    _check_viz_available()

    import matplotlib.pyplot as plt
    import seaborn as sns  # type: ignore[import-untyped]

    pair_scores = result.pairwise_scores
    chi_values = [entry["chi_squared"] for entry in pair_scores]
    mean_chi = result.mean_chi_squared
    # Coefficient of variation; falls back to 0 when the mean is not positive.
    if mean_chi > 0:
        cv = result.std_chi_squared / mean_chi
    else:
        cv = 0
    peak = result.max_location

    sns.set_theme(style="whitegrid")

    fig = plt.figure(figsize=figsize, constrained_layout=True)
    grid = fig.add_gridspec(3, 2, height_ratios=[2, 1, 1], hspace=0.3, wspace=0.3)

    # Styling shared by several panels.
    panel_title = {"fontsize": 12, "fontweight": "bold"}
    text_card = {"boxstyle": "round,pad=0.5", "facecolor": "#f8fafc", "edgecolor": "#e2e8f0"}

    # --- Panel 1: timeline across the whole top row -------------------------
    timeline_ax = fig.add_subplot(grid[0, :])
    timeline_ax.plot(
        chi_values, linewidth=2, color="#2563eb", marker="o", markersize=4, alpha=0.8
    )
    timeline_ax.fill_between(range(len(chi_values)), chi_values, alpha=0.2, color="#2563eb")

    if peak is not None and peak < len(chi_values):
        timeline_ax.axvline(x=peak, color="#dc2626", linestyle="--", linewidth=2, alpha=0.7)
        timeline_ax.scatter(
            [peak],
            [chi_values[peak]],
            color="#dc2626",
            s=150,
            zorder=5,
            edgecolors="white",
            linewidth=2,
        )

    timeline_ax.axhline(y=mean_chi, color="#6b7280", linestyle="-", linewidth=1, alpha=0.5)
    timeline_ax.set_xlabel("Window Pair Index")
    timeline_ax.set_ylabel("Chi-squared (χ²)")
    timeline_ax.set_title("Chi-squared Timeline", **panel_title)

    # --- Panel 2: histogram of the scores -----------------------------------
    hist_ax = fig.add_subplot(grid[1, 0])
    sns.histplot(chi_values, kde=True, ax=hist_ax, color="#2563eb", alpha=0.6)
    hist_ax.axvline(x=mean_chi, color="#dc2626", linestyle="--", linewidth=2)
    hist_ax.set_xlabel("Chi-squared (χ²)")
    hist_ax.set_ylabel("Count")
    hist_ax.set_title("Distribution", **panel_title)

    # --- Panel 3: summary statistics ----------------------------------------
    stats_ax = fig.add_subplot(grid[1, 1])
    stats_ax.axis("off")

    readable_pattern = result.pattern.replace("_", " ").title()
    thin_rule = "─────────────────────"
    stats_text = "\n".join(
        [
            f"Pattern: {readable_pattern}",
            f"Confidence: {result.pattern_confidence:.1%}",
            thin_rule,
            f"Mean χ²: {mean_chi:.1f}",
            f"Std χ²: {result.std_chi_squared:.1f}",
            f"CV: {cv:.3f}",
            f"Min χ²: {result.min_chi_squared:.1f}",
            f"Max χ²: {result.max_chi_squared:.1f}",
            thin_rule,
            f"Windows: {result.window_count}",
            f"Window Size: {result.window_size}",
            f"Stride: {result.stride}",
            f"Overlap: {result.overlap_ratio:.0%}",
        ]
    )
    stats_ax.text(
        0.1,
        0.9,
        stats_text,
        transform=stats_ax.transAxes,
        fontsize=11,
        family="monospace",
        verticalalignment="top",
        bbox=text_card,
    )
    stats_ax.set_title("Summary Statistics", **panel_title)

    # --- Panel 4: top contributing words at the maximum ---------------------
    words_ax = fig.add_subplot(grid[2, 0])

    # Work out whether there is anything to chart; otherwise remember which
    # placeholder message applies.
    top_words = None
    placeholder = "No spike detected"
    if peak is not None and peak < len(pair_scores):
        placeholder = "No word data available"
        spike_entry = pair_scores[peak]
        if "top_words" in spike_entry and spike_entry["top_words"]:
            top_words = spike_entry["top_words"][:10]

    if top_words is not None:
        # Reverse so the first (largest) entry ends up at the top of the chart.
        names = [item[0] for item in top_words][::-1]
        contributions = [item[1] for item in top_words][::-1]
        words_ax.barh(names, contributions, color="#2563eb", alpha=0.7)
        words_ax.set_xlabel("χ² Contribution")
        words_ax.set_title(f"Top Contributors at Spike (Window {peak})", **panel_title)
    else:
        words_ax.text(
            0.5, 0.5, placeholder, ha="center", va="center", transform=words_ax.transAxes
        )
        words_ax.set_title("Top Contributors at Spike", **panel_title)

    # --- Panel 5: zone classification ---------------------------------------
    zone_ax = fig.add_subplot(grid[2, 1])
    zone_ax.axis("off")

    baseline_zone = "Transition zone"
    if mean_chi < MEAN_CHI_LOW:
        baseline_zone = "AI-like baseline"
    elif mean_chi > MEAN_CHI_HIGH:
        baseline_zone = "Human-like baseline"

    volatility_zone = "Normal volatility"
    if cv < CV_LOW:
        volatility_zone = "Very stable"
    elif cv > CV_HIGH:
        volatility_zone = "Volatile"

    thick_rule = "═══════════════════════"
    zone_text = "\n".join(
        [
            "Zone Classification",
            thick_rule,
            "",
            f"Baseline: {baseline_zone}",
            f" Mean χ² = {mean_chi:.1f}",
            "",
            f"Volatility: {volatility_zone}",
            f" CV = {cv:.3f}",
            "",
            thick_rule,
            "Reference Bounds:",
            f" AI: Mean χ² < {MEAN_CHI_LOW}",
            f" Human: Mean χ² > {MEAN_CHI_HIGH}",
            f" Stable: CV < {CV_LOW}",
            f" Volatile: CV > {CV_HIGH}",
        ]
    )
    zone_ax.text(
        0.1,
        0.9,
        zone_text,
        transform=zone_ax.transAxes,
        fontsize=10,
        family="monospace",
        verticalalignment="top",
        bbox=text_card,
    )
    zone_ax.set_title("Zone Classification", **panel_title)

    # Figure-level title
    fig.suptitle(f"Drift Analysis Report: {label}", fontsize=16, fontweight="bold", y=0.98)

    if not output:
        plt.show()
        return

    plt.savefig(output, dpi=150, bbox_inches="tight")
    plt.close()
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"""Interactive JSX/HTML exports for pystylometry visualizations.
|
|
2
|
+
|
|
3
|
+
This module provides self-contained HTML exports using React via CDN.
|
|
4
|
+
Each visualization opens directly in a browser without build steps.
|
|
5
|
+
|
|
6
|
+
Available Functions:
|
|
7
|
+
export_drift_timeline_jsx: Timeline of chi-squared values
|
|
8
|
+
export_drift_report_jsx: Multi-panel dashboard
|
|
9
|
+
export_drift_viewer: Standalone viewer with file upload (no pre-computed data)
|
|
10
|
+
|
|
11
|
+
Example:
|
|
12
|
+
>>> from pystylometry.consistency import compute_kilgarriff_drift
|
|
13
|
+
>>> from pystylometry.viz.jsx import export_drift_timeline_jsx, export_drift_viewer
|
|
14
|
+
>>>
|
|
15
|
+
>>> # Pre-computed visualization
|
|
16
|
+
>>> result = compute_kilgarriff_drift(text)
|
|
17
|
+
>>> export_drift_timeline_jsx(result, "timeline.html")
|
|
18
|
+
>>>
|
|
19
|
+
>>> # Standalone viewer (users can upload their own files)
|
|
20
|
+
>>> export_drift_viewer("drift_analyzer.html")
|
|
21
|
+
"""
|
|
22
|
+
|
|
23
|
+
from .report import export_drift_report_jsx
|
|
24
|
+
from .timeline import export_drift_timeline_jsx
|
|
25
|
+
from .viewer import export_drift_viewer
|
|
26
|
+
|
|
27
|
+
__all__ = [
|
|
28
|
+
"export_drift_timeline_jsx",
|
|
29
|
+
"export_drift_report_jsx",
|
|
30
|
+
"export_drift_viewer",
|
|
31
|
+
]
|