pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/cli.py
ADDED
|
@@ -0,0 +1,427 @@
|
|
|
1
|
+
"""Command-line interface for pystylometry.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
|
|
5
|
+
pystylometry-drift <file> --plot [output.png]
|
|
6
|
+
|
|
7
|
+
Example:
|
|
8
|
+
pystylometry-drift manuscript.txt
|
|
9
|
+
pystylometry-drift manuscript.txt --window-size=500 --stride=250
|
|
10
|
+
pystylometry-drift manuscript.txt --json
|
|
11
|
+
pystylometry-drift manuscript.txt --plot
|
|
12
|
+
pystylometry-drift manuscript.txt --plot drift_report.png
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import argparse
|
|
18
|
+
import json
|
|
19
|
+
import sys
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def drift_cli() -> None:
    """CLI entry point for Kilgarriff chi-squared drift detection.

    Workflow:
        1. Parse and validate command-line arguments.
        2. Read the input file as UTF-8 text.
        3. Run ``compute_kilgarriff_drift`` over sliding windows.
        4. Emit results as a text report, JSON, a static plot, an
           interactive HTML export, or a full visualization bundle.

    Exits with status 1 on invalid arguments or unreadable input,
    status 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="pystylometry-drift",
        description="Detect stylistic drift within a document using Kilgarriff chi-squared.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  pystylometry-drift manuscript.txt
  pystylometry-drift manuscript.txt --window-size=500 --stride=250
  pystylometry-drift manuscript.txt --mode=all_pairs --json
  pystylometry-drift manuscript.txt --plot
  pystylometry-drift manuscript.txt --plot report.png
  pystylometry-drift manuscript.txt --plot timeline.png --plot-type=timeline
  pystylometry-drift manuscript.txt --jsx report.html --plot-type=report
  pystylometry-drift manuscript.txt --viz-all ./output  # All PNG + HTML

Pattern Signatures:
  consistent            Low, stable χ² across pairs (natural human writing)
  gradual_drift         Slowly increasing trend (author fatigue, topic shift)
  sudden_spike          One pair has high χ² (pasted content, different author)
  suspiciously_uniform  Near-zero variance (possible AI generation)
""",
    )

    parser.add_argument(
        "file",
        type=Path,
        help="Path to text file to analyze",
    )
    parser.add_argument(
        "--window-size",
        type=int,
        default=1000,
        help="Number of tokens per window (default: 1000)",
    )
    parser.add_argument(
        "--stride",
        type=int,
        default=500,
        help="Tokens to advance between windows (default: 500)",
    )
    parser.add_argument(
        "--mode",
        choices=["sequential", "all_pairs", "fixed_lag"],
        default="sequential",
        help="Comparison mode (default: sequential)",
    )
    parser.add_argument(
        "--n-words",
        type=int,
        default=500,
        help="Most frequent words to analyze (default: 500)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--plot",
        nargs="?",
        const="",
        default=None,
        metavar="OUTPUT",
        help="Generate visualization (optional: path to save, otherwise displays interactively)",
    )
    parser.add_argument(
        "--plot-type",
        choices=["report", "timeline"],
        default="report",
        help="Visualization type: report (multi-panel) or timeline (line chart)",
    )
    parser.add_argument(
        "--jsx",
        metavar="OUTPUT_FILE",
        help="Export interactive visualization as standalone HTML (uses --plot-type)",
    )
    parser.add_argument(
        "--viz-all",
        metavar="OUTPUT_DIR",
        type=Path,
        help="Generate ALL visualizations (PNG + HTML) to directory for testing",
    )

    args = parser.parse_args()

    # Validate window parameters (fix: previously unchecked).
    # window_size == 0 would raise ZeroDivisionError in the overlap
    # percentage calculation below, and stride <= 0 would make the
    # --viz-all chunking loop spin forever because `start` never advances.
    if args.window_size <= 0:
        print(
            f"Error: --window-size must be a positive integer (got {args.window_size})",
            file=sys.stderr,
        )
        sys.exit(1)
    if args.stride <= 0:
        print(
            f"Error: --stride must be a positive integer (got {args.stride})",
            file=sys.stderr,
        )
        sys.exit(1)

    # Validate file exists
    if not args.file.exists():
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Read file
    try:
        text = args.file.read_text(encoding="utf-8")
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    # Determine output mode (precedence: viz-all > jsx > plot > json > text)
    if args.viz_all:
        output_mode = "All Visualizations (PNG + HTML)"
        output_dest = str(args.viz_all)
    elif args.jsx:
        output_mode = f"Interactive HTML ({args.plot_type})"
        output_dest = args.jsx
    elif args.plot is not None:
        output_mode = f"Plot ({args.plot_type})"
        output_dest = args.plot if args.plot else "interactive display"
    elif args.json:
        output_mode = "JSON"
        output_dest = "stdout"
    else:
        output_mode = "Text Report"
        output_dest = "stdout"

    # Calculate file stats
    token_count = len(text.split())
    char_count = len(text)

    # Print professional intro banner
    print()
    print(" PYSTYLOMETRY — Kilgarriff Chi-Squared Drift Detection")
    print(" ═══════════════════════════════════════════════════════════════════════")
    print()
    print(" INPUT")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" File: {args.file}")
    print(f" Size: {char_count:,} characters / {token_count:,} tokens")
    print()
    print(" PARAMETERS")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" Window size: {args.window_size} tokens")
    print(f" Stride: {args.stride} tokens")
    print(
        f" Overlap: {((args.window_size - args.stride) / args.window_size) * 100:.0f}%"
    )
    print(f" Comparison mode: {args.mode}")
    print(f" Top N words: {args.n_words}")
    print()
    print(" OUTPUT")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" Format: {output_mode}")
    print(f" Destination: {output_dest}")
    print()
    print(" Running analysis...")
    print()

    # Import here to avoid slow startup
    from pystylometry.consistency import compute_kilgarriff_drift

    # Run analysis
    result = compute_kilgarriff_drift(
        text,
        window_size=args.window_size,
        stride=args.stride,
        comparison_mode=args.mode,
        n_words=args.n_words,
    )

    # Handle --viz-all: generate all visualizations for testing
    if args.viz_all:
        output_dir = args.viz_all
        output_dir.mkdir(parents=True, exist_ok=True)
        label = args.file.stem

        from pystylometry.viz.jsx import export_drift_timeline_jsx

        generated = []

        # Write chunks to subdirectory
        chunks_dir = output_dir / "chunks"
        chunks_dir.mkdir(parents=True, exist_ok=True)

        # Re-create windows to get chunk text (simple word-based chunking).
        # NOTE(review): this mirrors the windowing that
        # compute_kilgarriff_drift performs internally — confirm the two
        # stay in sync if the drift implementation changes.
        words = text.split()
        chunk_texts = []
        start = 0
        chunk_idx = 0
        while start + args.window_size <= len(words):
            chunk_words = words[start : start + args.window_size]
            chunk_text = " ".join(chunk_words)
            chunk_texts.append(chunk_text)

            # Write chunk file
            chunk_path = chunks_dir / f"chunk_{chunk_idx:03d}.txt"
            chunk_path.write_text(chunk_text, encoding="utf-8")
            chunk_idx += 1
            start += args.stride

        print(f" Created: {chunks_dir}/ ({len(chunk_texts)} chunks)")

        # Generate timeline HTML with chunk content
        out_path = output_dir / "drift-detection.html"
        export_drift_timeline_jsx(
            result,
            output_file=out_path,
            title=f"Drift Timeline: {label}",
            chunks=chunk_texts,
        )
        generated.append(out_path)
        print(f" Created: {out_path}")

        print()
        n_viz, n_chunks = len(generated), len(chunk_texts)
        print(f"Generated {n_viz} visualizations + {n_chunks} chunks to: {output_dir.resolve()}")
        sys.exit(0)

    # Handle JSX export (generates standalone HTML)
    if args.jsx:
        from pystylometry.viz.jsx import (
            export_drift_report_jsx,
            export_drift_timeline_jsx,
        )

        label = args.file.stem

        if args.plot_type == "timeline":
            output_path = export_drift_timeline_jsx(
                result,
                output_file=args.jsx,
                title=f"Drift Timeline: {label}",
            )
        else:  # report (default)
            output_path = export_drift_report_jsx(
                result,
                output_file=args.jsx,
                label=label,
            )

        abs_path = output_path.resolve()
        file_url = f"file://{abs_path}"
        print(f"Interactive visualization saved to: {output_path}")
        print(f"Open in browser: {file_url}")
        sys.exit(0)

    # Handle plot output (matplotlib-based; optional dependency)
    if args.plot is not None:
        try:
            from pystylometry.viz import plot_drift_report, plot_drift_timeline
        except ImportError:
            print(
                "Error: Visualization requires optional dependencies.",
                file=sys.stderr,
            )
            print(
                "Install with: pip install pystylometry[viz] or poetry install --with viz",
                file=sys.stderr,
            )
            sys.exit(1)

        # Empty string (bare --plot) means "display interactively".
        plot_output: str | None = args.plot if args.plot else None
        label = args.file.stem

        if args.plot_type == "timeline":
            plot_drift_timeline(result, output=plot_output, title=f"Drift Timeline: {label}")
        else:  # report (default)
            plot_drift_report(result, label=label, output=plot_output)

        if plot_output:
            print(f"Visualization saved to: {plot_output}")
        sys.exit(0)

    if args.json:
        # JSON output
        output = {
            "status": result.status,
            "status_message": result.status_message,
            "pattern": result.pattern,
            "pattern_confidence": result.pattern_confidence,
            "mean_chi_squared": result.mean_chi_squared,
            "std_chi_squared": result.std_chi_squared,
            "max_chi_squared": result.max_chi_squared,
            "min_chi_squared": result.min_chi_squared,
            "max_location": result.max_location,
            "trend": result.trend,
            "window_size": result.window_size,
            "stride": result.stride,
            "overlap_ratio": result.overlap_ratio,
            "window_count": result.window_count,
            "comparison_mode": result.comparison_mode,
        }
        print(json.dumps(output, indent=2))
    else:
        # Human-readable output
        print("=" * 60)
        print("STYLISTIC DRIFT ANALYSIS")
        print("=" * 60)
        print(f"File: {args.file}")
        print(f"Status: {result.status}")
        print()

        if result.status == "insufficient_data":
            print(f"⚠️ {result.status_message}")
            print()
            print(f"Windows created: {result.window_count}")
            print("Minimum required: 3")
            print()
            print("Try reducing --window-size or --stride to create more windows.")
            sys.exit(0)

        print("PATTERN DETECTED")
        print("-" * 40)
        print(f" Pattern: {result.pattern}")
        print(f" Confidence: {result.pattern_confidence:.1%}")
        print()

        if result.pattern == "consistent":
            print(" ✓ Text shows consistent writing style throughout.")
        elif result.pattern == "gradual_drift":
            print(" ↗ Text shows gradual stylistic drift over its length.")
            print(" Possible causes: author fatigue, topic evolution, revision.")
        elif result.pattern == "sudden_spike":
            print(" ⚡ Text contains a sudden stylistic discontinuity.")
            loc = result.max_location
            print(f" Location: Between windows {loc} and {loc + 1}")
            print(" Possible causes: pasted content, different author, major edit.")
        elif result.pattern == "suspiciously_uniform":
            print(" ⚠️ Text shows unusually uniform style (near-zero variance).")
            print(" Possible causes: AI-generated content, heavy editing, templated text.")

        print()
        print("CHI-SQUARED STATISTICS")
        print("-" * 40)
        print(f" Mean χ²: {result.mean_chi_squared:.2f}")
        print(f" Std χ²: {result.std_chi_squared:.2f}")
        print(f" Min χ²: {result.min_chi_squared:.2f}")
        print(f" Max χ²: {result.max_chi_squared:.2f}")
        print(f" Trend: {result.trend:+.4f}")
        print()

        print("WINDOW CONFIGURATION")
        print("-" * 40)
        print(f" Window size: {result.window_size} tokens")
        print(f" Stride: {result.stride} tokens")
        print(f" Overlap: {result.overlap_ratio:.1%}")
        print(f" Windows: {result.window_count}")
        print(f" Comparisons: {len(result.pairwise_scores)}")
        print()

        if result.status == "marginal_data":
            print(f"⚠️ {result.status_message}")
            print()
|
+
def viewer_cli() -> None:
    """CLI entry point for generating a standalone drift viewer.

    Writes a self-contained HTML analyzer (no server or Python needed by
    the end user) to the requested path and prints the location plus a
    ``file://`` URL for opening it in a browser.
    """
    arg_parser = argparse.ArgumentParser(
        prog="pystylometry-viewer",
        description="Generate a standalone HTML drift analysis viewer.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
This generates a self-contained HTML file that users can open in any browser
to analyze their own text files. No Python or server required - just share
the HTML file and anyone can use it.

Examples:
  pystylometry-viewer drift_analyzer.html
  pystylometry-viewer ~/Desktop/analyzer.html --title "My Drift Analyzer"

The generated viewer includes:
  - Drag-and-drop file upload
  - Configurable analysis parameters
  - Interactive timeline visualization
  - Client-side Kilgarriff chi-squared implementation
""",
    )

    arg_parser.add_argument(
        "output",
        type=Path,
        help="Path to write the HTML viewer file",
    )
    arg_parser.add_argument(
        "--title",
        default="Stylistic Drift Analyzer",
        help="Page title (default: 'Stylistic Drift Analyzer')",
    )

    opts = arg_parser.parse_args()

    # Deferred import keeps `--help` responsive.
    from pystylometry.viz.jsx import export_drift_viewer

    written_path = export_drift_viewer(opts.output, title=opts.title)

    browser_url = f"file://{written_path.resolve()}"

    # Emit the summary banner in a single write; stdout is identical to
    # printing each line individually.
    banner = [
        "",
        " PYSTYLOMETRY — Standalone Drift Viewer",
        " ═══════════════════════════════════════════════════════════════════════",
        "",
        f" Generated: {written_path}",
        f" Open in browser: {browser_url}",
        "",
        " This viewer can be shared with anyone. Users can:",
        " • Drag-and-drop or upload .txt files",
        " • Configure analysis parameters",
        " • View interactive drift timeline",
        " • Click points to see chunk comparisons",
        "",
    ]
    print("\n".join(banner))
425
|
+
|
|
426
|
+
# Allow direct execution (python -m / python cli.py); installed console
# scripts invoke drift_cli()/viewer_cli() via the package entry points.
if __name__ == "__main__":
    drift_cli()
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
# consistency
|
|
2
|
+
|
|
3
|
+
|
|
5
|
+
|
|
6
|
+
Intra-document style drift detection using sliding-window chi-squared analysis.
|
|
7
|
+
|
|
8
|
+
## Catalogue
|
|
9
|
+
|
|
10
|
+
| File | Function | What It Does |
|
|
11
|
+
|------|----------|-------------|
|
|
12
|
+
| `drift.py` | `compute_kilgarriff_drift` | Detects stylistic drift, splice points, and AI-generation signatures |
|
|
13
|
+
| `_thresholds.py` | _(internal)_ | Classification thresholds for pattern detection |
|
|
14
|
+
|
|
15
|
+
## Detected Patterns
|
|
16
|
+
|
|
17
|
+
| Pattern | Meaning |
|
|
18
|
+
|---------|---------|
|
|
19
|
+
| `consistent` | Natural human variation throughout |
|
|
20
|
+
| `gradual_drift` | Style shifts progressively over the document |
|
|
21
|
+
| `sudden_spike` | Abrupt discontinuity (possible splice or paste) |
|
|
22
|
+
| `suspiciously_uniform` | Unnaturally low variation (possible AI generation) |
|
|
23
|
+
|
|
24
|
+
## See Also
|
|
25
|
+
|
|
26
|
+
- [`authorship/kilgarriff.py`](../authorship/) -- the underlying chi-squared method (between-text comparison)
|
|
27
|
+
- [`viz/`](../viz/) for timeline and report visualizations of drift results
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Consistency analysis module for pystylometry.
|
|
2
|
+
|
|
3
|
+
This module provides tools for analyzing internal stylistic consistency within
|
|
4
|
+
a single document. Unlike the `authorship` module (which compares different texts),
|
|
5
|
+
the `consistency` module focuses on detecting patterns within one text:
|
|
6
|
+
|
|
7
|
+
- Stylistic drift over the course of a document
|
|
8
|
+
- Sudden discontinuities suggesting pasted content or different authors
|
|
9
|
+
- Suspiciously uniform style (potential AI generation signature)
|
|
10
|
+
- Natural variation patterns in human writing
|
|
11
|
+
|
|
12
|
+
Related GitHub Issues:
|
|
13
|
+
#36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
|
|
14
|
+
https://github.com/craigtrim/pystylometry/issues/36
|
|
15
|
+
#27 - Native chunked analysis with Distribution dataclass
|
|
16
|
+
https://github.com/craigtrim/pystylometry/issues/27
|
|
17
|
+
|
|
18
|
+
Marketing Names:
|
|
19
|
+
- "Style Drift Detector"
|
|
20
|
+
- "Consistency Fingerprint"
|
|
21
|
+
- "Authorship Continuity Score"
|
|
22
|
+
|
|
23
|
+
Available Functions:
|
|
24
|
+
compute_kilgarriff_drift: Detect stylistic drift using chi-squared method
|
|
25
|
+
|
|
26
|
+
Example Usage:
|
|
27
|
+
>>> from pystylometry.consistency import compute_kilgarriff_drift
|
|
28
|
+
>>>
|
|
29
|
+
>>> # Analyze a document for stylistic consistency
|
|
30
|
+
>>> result = compute_kilgarriff_drift(document_text)
|
|
31
|
+
>>>
|
|
32
|
+
>>> # Check the detected pattern
|
|
33
|
+
>>> print(f"Pattern: {result.pattern}") # e.g., "consistent", "sudden_spike"
|
|
34
|
+
>>> print(f"Confidence: {result.pattern_confidence:.2f}")
|
|
35
|
+
>>>
|
|
36
|
+
>>> # Investigate potential AI generation
|
|
37
|
+
>>> if result.pattern == "suspiciously_uniform":
|
|
38
|
+
... print("Warning: Text shows unusually uniform style")
|
|
39
|
+
>>>
|
|
40
|
+
>>> # Find where the biggest style shift occurs
|
|
41
|
+
>>> if result.pattern == "sudden_spike":
|
|
42
|
+
... print(f"Major discontinuity at window boundary {result.max_location}")
|
|
43
|
+
|
|
44
|
+
References:
|
|
45
|
+
Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
|
|
46
|
+
Linguistics, vol. 6, no. 1, 2001, pp. 97-133.
|
|
47
|
+
|
|
48
|
+
Eder, Maciej. "Does Size Matter? Authorship Attribution, Small Samples,
|
|
49
|
+
Big Problem." Digital Scholarship in the Humanities, vol. 30, no. 2,
|
|
50
|
+
2015, pp. 167-182.
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
from .drift import compute_kilgarriff_drift
|
|
54
|
+
|
|
55
|
+
__all__ = [
|
|
56
|
+
"compute_kilgarriff_drift",
|
|
57
|
+
]
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
"""Threshold constants for consistency pattern classification.
|
|
2
|
+
|
|
3
|
+
This module contains calibration constants used for classifying stylistic
|
|
4
|
+
patterns in the consistency module. These thresholds determine how the
|
|
5
|
+
`compute_kilgarriff_drift()` function classifies detected patterns.
|
|
6
|
+
|
|
7
|
+
Related GitHub Issues:
|
|
8
|
+
#36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
|
|
9
|
+
https://github.com/craigtrim/pystylometry/issues/36
|
|
10
|
+
|
|
11
|
+
Calibration Notes:
|
|
12
|
+
These thresholds are initial estimates based on theoretical considerations
|
|
13
|
+
and limited empirical testing. They should be refined through systematic
|
|
14
|
+
evaluation on diverse corpora including:
|
|
15
|
+
- Human-written texts of various lengths and genres
|
|
16
|
+
- AI-generated texts from different models
|
|
17
|
+
- Mixed human/AI texts
|
|
18
|
+
- Multi-author documents
|
|
19
|
+
|
|
20
|
+
The thresholds are exposed as module-level constants to allow:
|
|
21
|
+
1. Transparency: Users can inspect what values are used
|
|
22
|
+
2. Customization: Advanced users can override via metadata or subclassing
|
|
23
|
+
3. Research: Easy adjustment for empirical calibration studies
|
|
24
|
+
|
|
25
|
+
Pattern Classification Logic:
|
|
26
|
+
The pattern classification uses a decision tree approach:
|
|
27
|
+
|
|
28
|
+
1. First check for insufficient variance (suspiciously uniform)
|
|
29
|
+
- AI-generated text often shows near-identical statistics across chunks
|
|
30
|
+
- This is detected by very low std_chi_squared
|
|
31
|
+
|
|
32
|
+
2. Then check for sudden discontinuities (sudden spike)
|
|
33
|
+
- Pasted content or different authors cause outlier chi-squared values
|
|
34
|
+
- Detected by max_chi_squared significantly exceeding mean
|
|
35
|
+
|
|
36
|
+
3. Then check for trends (gradual drift)
|
|
37
|
+
- Author fatigue or topic evolution shows increasing chi-squared
|
|
38
|
+
- Detected by significant slope in chi-squared over time
|
|
39
|
+
|
|
40
|
+
4. Otherwise classify as consistent (natural variation)
|
|
41
|
+
- Human writing typically shows moderate, stable variance
|
|
42
|
+
|
|
43
|
+
References:
|
|
44
|
+
The thresholds are informed by stylometric literature but require
|
|
45
|
+
empirical validation:
|
|
46
|
+
|
|
47
|
+
Eder, Maciej, et al. "Stylometry with R: A Package for Computational Text
|
|
48
|
+
Analysis." The R Journal, vol. 8, no. 1, 2016, pp. 107-121.
|
|
49
|
+
|
|
50
|
+
Juola, Patrick. "Authorship Attribution." Foundations and Trends in
|
|
51
|
+
Information Retrieval, vol. 1, no. 3, 2006, pp. 233-334.
|
|
52
|
+
"""
|
|
53
|
+
|
|
54
|
+
from __future__ import annotations
|
|
55
|
+
|
|
56
|
+
# =============================================================================
# Window Count Thresholds
# =============================================================================
# Minimum data requirements before any analysis is meaningful.

# Hard floor: variance needs at least 2 comparisons, and 3 windows yield
# exactly the pairs (1,2) and (2,3).
MIN_WINDOWS = 3

# Soft floor for reliable classification: 5 windows give 4 sequential pairs,
# enough to compute mean, std, and a basic trend.
RECOMMENDED_WINDOWS = 5


# =============================================================================
# Pattern Classification Thresholds
# =============================================================================
# Cut-offs that map raw chi-squared statistics onto named patterns.

# --- Suspiciously Uniform Detection ---
# Human prose fluctuates naturally; AI-generated text often holds
# near-identical statistics from chunk to chunk.

# Coefficient of variation (std / mean) below which variance is
# suspiciously low.
UNIFORM_CV_THRESHOLD = 0.15

# The mean itself must also be low to call the text "uniform" — a high
# baseline with low variance is merely "consistent".
UNIFORM_MEAN_THRESHOLD = 50.0


# --- Sudden Spike Detection ---
# Pasted content, a second author, or a major edit shows up as a single
# outlier comparison.

# A spike is flagged when max exceeds SPIKE_RATIO times the mean.
SPIKE_RATIO = 2.5

# Floor on the spike's absolute size, guarding against false positives
# on texts with a very low baseline.
SPIKE_MIN_ABSOLUTE = 100.0


# --- Gradual Drift Detection ---
# Chi-squared rising over the document suggests evolving style or fatigue.

# Minimum slope (chi-squared units per chunk pair) for a trend; positive
# means diverging style over time, negative means converging (rare).
TREND_SLOPE_THRESHOLD = 5.0

# The fitted trend must explain at least this fraction of variance (R²)
# to count as meaningful.
TREND_R_SQUARED_THRESHOLD = 0.3


# --- Consistent Pattern ---
# Fallback classification when no pattern above fires: ordinary human
# writing with normal variation. (No constant needed.)


# =============================================================================
# Confidence Calculation Weights
# =============================================================================
# Controls for pattern-confidence scoring.

# Full confidence requires at least this many windows; below it,
# confidence is scaled down proportionally.
CONFIDENCE_MIN_WINDOWS = 5

# Ceiling on confidence when only marginal data is available.
MARGINAL_DATA_MAX_CONFIDENCE = 0.6


# =============================================================================
# Export all thresholds as a dict for easy inspection
# =============================================================================


def get_all_thresholds() -> dict[str, float]:
    """
    Return all threshold values as a dictionary.

    Useful for:
    - Logging/debugging: record which thresholds were in effect
    - Transparency: embed in result metadata
    - Research: compare alternative threshold settings

    Returns:
        Dict mapping threshold names to their values

    Example:
        >>> thresholds = get_all_thresholds()
        >>> print(f"Spike ratio: {thresholds['spike_ratio']}")
    """
    return dict(
        min_windows=MIN_WINDOWS,
        recommended_windows=RECOMMENDED_WINDOWS,
        uniform_cv_threshold=UNIFORM_CV_THRESHOLD,
        uniform_mean_threshold=UNIFORM_MEAN_THRESHOLD,
        spike_ratio=SPIKE_RATIO,
        spike_min_absolute=SPIKE_MIN_ABSOLUTE,
        trend_slope_threshold=TREND_SLOPE_THRESHOLD,
        trend_r_squared_threshold=TREND_R_SQUARED_THRESHOLD,
        confidence_min_windows=CONFIDENCE_MIN_WINDOWS,
        marginal_data_max_confidence=MARGINAL_DATA_MAX_CONFIDENCE,
    )
|