pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/cli.py ADDED
@@ -0,0 +1,427 @@
1
+ """Command-line interface for pystylometry.
2
+
3
+ Usage:
4
+ pystylometry-drift <file> [--window-size=N] [--stride=N] [--mode=MODE] [--json]
5
+ pystylometry-drift <file> --plot [output.png]
6
+
7
+ Example:
8
+ pystylometry-drift manuscript.txt
9
+ pystylometry-drift manuscript.txt --window-size=500 --stride=250
10
+ pystylometry-drift manuscript.txt --json
11
+ pystylometry-drift manuscript.txt --plot
12
+ pystylometry-drift manuscript.txt --plot drift_report.png
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import argparse
18
+ import json
19
+ import sys
20
+ from pathlib import Path
21
+
22
+
23
def _describe_output(args: argparse.Namespace) -> tuple[str, str]:
    """Return ``(format_label, destination_label)`` for the intro banner.

    Mirrors the priority order of the output handlers below:
    --viz-all > --jsx > --plot > --json > text report.
    """
    if args.viz_all:
        return "All Visualizations (PNG + HTML)", str(args.viz_all)
    if args.jsx:
        return f"Interactive HTML ({args.plot_type})", args.jsx
    if args.plot is not None:
        return f"Plot ({args.plot_type})", args.plot if args.plot else "interactive display"
    if args.json:
        return "JSON", "stdout"
    return "Text Report", "stdout"


def _print_banner(
    args: argparse.Namespace,
    char_count: int,
    token_count: int,
    output_mode: str,
    output_dest: str,
) -> None:
    """Print the human-readable intro banner (input, parameters, output)."""
    print()
    print(" PYSTYLOMETRY — Kilgarriff Chi-Squared Drift Detection")
    print(" ═══════════════════════════════════════════════════════════════════════")
    print()
    print(" INPUT")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" File: {args.file}")
    print(f" Size: {char_count:,} characters / {token_count:,} tokens")
    print()
    print(" PARAMETERS")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" Window size: {args.window_size} tokens")
    print(f" Stride: {args.stride} tokens")
    print(
        f" Overlap: {((args.window_size - args.stride) / args.window_size) * 100:.0f}%"
    )
    print(f" Comparison mode: {args.mode}")
    print(f" Top N words: {args.n_words}")
    print()
    print(" OUTPUT")
    print(" ───────────────────────────────────────────────────────────────────────")
    print(f" Format: {output_mode}")
    print(f" Destination: {output_dest}")
    print()
    print(" Running analysis...")
    print()


def drift_cli() -> None:
    """CLI entry point for Kilgarriff drift detection.

    Reads a UTF-8 text file, slices it into overlapping token windows, and
    classifies the document's internal stylistic consistency via
    ``pystylometry.consistency.compute_kilgarriff_drift``. Results are
    emitted as a text report (default), JSON (``--json``), a matplotlib
    figure (``--plot``), standalone interactive HTML (``--jsx``), or a full
    visualization bundle plus window chunk files (``--viz-all``).

    Exits with status 1 when the input file is missing/unreadable or the
    window parameters are invalid; status 0 otherwise.
    """
    parser = argparse.ArgumentParser(
        prog="pystylometry-drift",
        description="Detect stylistic drift within a document using Kilgarriff chi-squared.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  pystylometry-drift manuscript.txt
  pystylometry-drift manuscript.txt --window-size=500 --stride=250
  pystylometry-drift manuscript.txt --mode=all_pairs --json
  pystylometry-drift manuscript.txt --plot
  pystylometry-drift manuscript.txt --plot report.png
  pystylometry-drift manuscript.txt --plot timeline.png --plot-type=timeline
  pystylometry-drift manuscript.txt --jsx report.html --plot-type=report
  pystylometry-drift manuscript.txt --viz-all ./output  # All PNG + HTML

Pattern Signatures:
  consistent            Low, stable χ² across pairs (natural human writing)
  gradual_drift         Slowly increasing trend (author fatigue, topic shift)
  sudden_spike          One pair has high χ² (pasted content, different author)
  suspiciously_uniform  Near-zero variance (possible AI generation)
""",
    )

    parser.add_argument(
        "file",
        type=Path,
        help="Path to text file to analyze",
    )
    parser.add_argument(
        "--window-size",
        type=int,
        default=1000,
        help="Number of tokens per window (default: 1000)",
    )
    parser.add_argument(
        "--stride",
        type=int,
        default=500,
        help="Tokens to advance between windows (default: 500)",
    )
    parser.add_argument(
        "--mode",
        choices=["sequential", "all_pairs", "fixed_lag"],
        default="sequential",
        help="Comparison mode (default: sequential)",
    )
    parser.add_argument(
        "--n-words",
        type=int,
        default=500,
        help="Most frequent words to analyze (default: 500)",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results as JSON",
    )
    parser.add_argument(
        "--plot",
        nargs="?",
        const="",
        default=None,
        metavar="OUTPUT",
        help="Generate visualization (optional: path to save, otherwise displays interactively)",
    )
    parser.add_argument(
        "--plot-type",
        choices=["report", "timeline"],
        default="report",
        help="Visualization type: report (multi-panel) or timeline (line chart)",
    )
    parser.add_argument(
        "--jsx",
        metavar="OUTPUT_FILE",
        help="Export interactive visualization as standalone HTML (uses --plot-type)",
    )
    parser.add_argument(
        "--viz-all",
        metavar="OUTPUT_DIR",
        type=Path,
        help="Generate ALL visualizations (PNG + HTML) to directory for testing",
    )

    args = parser.parse_args()

    # Validate file exists before doing any work.
    if not args.file.exists():
        print(f"Error: File not found: {args.file}", file=sys.stderr)
        sys.exit(1)

    # Robustness fix: window_size == 0 would divide by zero in the overlap
    # calculation, and stride <= 0 would keep the --viz-all chunking loop
    # from ever terminating. Reject non-positive values with a clear error.
    if args.window_size <= 0 or args.stride <= 0:
        print("Error: --window-size and --stride must be positive", file=sys.stderr)
        sys.exit(1)

    # Read file
    try:
        text = args.file.read_text(encoding="utf-8")
    except Exception as e:
        print(f"Error reading file: {e}", file=sys.stderr)
        sys.exit(1)

    output_mode, output_dest = _describe_output(args)

    # Bug fix: the banner was previously printed unconditionally, so --json
    # produced a stdout stream that was not valid JSON (banner followed by
    # the document). Keep stdout machine-readable when JSON is requested.
    if not args.json:
        _print_banner(args, len(text), len(text.split()), output_mode, output_dest)

    # Import here to avoid slow startup
    from pystylometry.consistency import compute_kilgarriff_drift

    # Run analysis
    result = compute_kilgarriff_drift(
        text,
        window_size=args.window_size,
        stride=args.stride,
        comparison_mode=args.mode,
        n_words=args.n_words,
    )

    # Handle --viz-all: generate all visualizations for testing
    if args.viz_all:
        output_dir = args.viz_all
        output_dir.mkdir(parents=True, exist_ok=True)
        label = args.file.stem

        from pystylometry.viz.jsx import export_drift_timeline_jsx

        generated = []

        # Write chunks to subdirectory
        chunks_dir = output_dir / "chunks"
        chunks_dir.mkdir(parents=True, exist_ok=True)

        # Re-create windows to get chunk text (simple word-based chunking,
        # mirroring the windowing parameters passed to the analysis above).
        words = text.split()
        chunk_texts = []
        start = 0
        chunk_idx = 0
        while start + args.window_size <= len(words):
            chunk_words = words[start : start + args.window_size]
            chunk_text = " ".join(chunk_words)
            chunk_texts.append(chunk_text)

            # Write chunk file
            chunk_path = chunks_dir / f"chunk_{chunk_idx:03d}.txt"
            chunk_path.write_text(chunk_text, encoding="utf-8")
            chunk_idx += 1
            start += args.stride

        print(f" Created: {chunks_dir}/ ({len(chunk_texts)} chunks)")

        # Generate timeline HTML with chunk content
        out_path = output_dir / "drift-detection.html"
        export_drift_timeline_jsx(
            result,
            output_file=out_path,
            title=f"Drift Timeline: {label}",
            chunks=chunk_texts,
        )
        generated.append(out_path)
        print(f" Created: {out_path}")

        print()
        n_viz, n_chunks = len(generated), len(chunk_texts)
        print(f"Generated {n_viz} visualizations + {n_chunks} chunks to: {output_dir.resolve()}")
        sys.exit(0)

    # Handle JSX export (generates standalone HTML)
    if args.jsx:
        from pystylometry.viz.jsx import (
            export_drift_report_jsx,
            export_drift_timeline_jsx,
        )

        label = args.file.stem

        if args.plot_type == "timeline":
            output_path = export_drift_timeline_jsx(
                result,
                output_file=args.jsx,
                title=f"Drift Timeline: {label}",
            )
        else:  # report (default)
            output_path = export_drift_report_jsx(
                result,
                output_file=args.jsx,
                label=label,
            )

        abs_path = output_path.resolve()
        file_url = f"file://{abs_path}"
        print(f"Interactive visualization saved to: {output_path}")
        print(f"Open in browser: {file_url}")
        sys.exit(0)

    # Handle plot output (matplotlib-based extras are optional)
    if args.plot is not None:
        try:
            from pystylometry.viz import plot_drift_report, plot_drift_timeline
        except ImportError:
            print(
                "Error: Visualization requires optional dependencies.",
                file=sys.stderr,
            )
            print(
                "Install with: pip install pystylometry[viz] or poetry install --with viz",
                file=sys.stderr,
            )
            sys.exit(1)

        # Empty string (bare --plot) means interactive display, not a file.
        plot_output: str | None = args.plot if args.plot else None
        label = args.file.stem

        if args.plot_type == "timeline":
            plot_drift_timeline(result, output=plot_output, title=f"Drift Timeline: {label}")
        else:  # report (default)
            plot_drift_report(result, label=label, output=plot_output)

        if plot_output:
            print(f"Visualization saved to: {plot_output}")
        sys.exit(0)

    if args.json:
        # JSON output — stdout now contains ONLY the JSON document.
        output = {
            "status": result.status,
            "status_message": result.status_message,
            "pattern": result.pattern,
            "pattern_confidence": result.pattern_confidence,
            "mean_chi_squared": result.mean_chi_squared,
            "std_chi_squared": result.std_chi_squared,
            "max_chi_squared": result.max_chi_squared,
            "min_chi_squared": result.min_chi_squared,
            "max_location": result.max_location,
            "trend": result.trend,
            "window_size": result.window_size,
            "stride": result.stride,
            "overlap_ratio": result.overlap_ratio,
            "window_count": result.window_count,
            "comparison_mode": result.comparison_mode,
        }
        print(json.dumps(output, indent=2))
    else:
        # Human-readable output
        print("=" * 60)
        print("STYLISTIC DRIFT ANALYSIS")
        print("=" * 60)
        print(f"File: {args.file}")
        print(f"Status: {result.status}")
        print()

        if result.status == "insufficient_data":
            print(f"⚠️ {result.status_message}")
            print()
            print(f"Windows created: {result.window_count}")
            print("Minimum required: 3")
            print()
            print("Try reducing --window-size or --stride to create more windows.")
            sys.exit(0)

        print("PATTERN DETECTED")
        print("-" * 40)
        print(f" Pattern: {result.pattern}")
        print(f" Confidence: {result.pattern_confidence:.1%}")
        print()

        if result.pattern == "consistent":
            print(" ✓ Text shows consistent writing style throughout.")
        elif result.pattern == "gradual_drift":
            print(" ↗ Text shows gradual stylistic drift over its length.")
            print(" Possible causes: author fatigue, topic evolution, revision.")
        elif result.pattern == "sudden_spike":
            print(" ⚡ Text contains a sudden stylistic discontinuity.")
            loc = result.max_location
            print(f" Location: Between windows {loc} and {loc + 1}")
            print(" Possible causes: pasted content, different author, major edit.")
        elif result.pattern == "suspiciously_uniform":
            print(" ⚠️ Text shows unusually uniform style (near-zero variance).")
            print(" Possible causes: AI-generated content, heavy editing, templated text.")

        print()
        print("CHI-SQUARED STATISTICS")
        print("-" * 40)
        print(f" Mean χ²: {result.mean_chi_squared:.2f}")
        print(f" Std χ²: {result.std_chi_squared:.2f}")
        print(f" Min χ²: {result.min_chi_squared:.2f}")
        print(f" Max χ²: {result.max_chi_squared:.2f}")
        print(f" Trend: {result.trend:+.4f}")
        print()

        print("WINDOW CONFIGURATION")
        print("-" * 40)
        print(f" Window size: {result.window_size} tokens")
        print(f" Stride: {result.stride} tokens")
        print(f" Overlap: {result.overlap_ratio:.1%}")
        print(f" Windows: {result.window_count}")
        print(f" Comparisons: {len(result.pairwise_scores)}")
        print()

        if result.status == "marginal_data":
            print(f"⚠️ {result.status_message}")
            print()
368
def viewer_cli() -> None:
    """CLI entry point for generating a standalone drift viewer.

    Writes a self-contained HTML analyzer via
    ``pystylometry.viz.jsx.export_drift_viewer`` and prints where it was
    saved plus a ``file://`` URL for opening it in a browser.
    """
    parser = argparse.ArgumentParser(
        prog="pystylometry-viewer",
        description="Generate a standalone HTML drift analysis viewer.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
This generates a self-contained HTML file that users can open in any browser
to analyze their own text files. No Python or server required - just share
the HTML file and anyone can use it.

Examples:
  pystylometry-viewer drift_analyzer.html
  pystylometry-viewer ~/Desktop/analyzer.html --title "My Drift Analyzer"

The generated viewer includes:
  - Drag-and-drop file upload
  - Configurable analysis parameters
  - Interactive timeline visualization
  - Client-side Kilgarriff chi-squared implementation
""",
    )
    parser.add_argument(
        "output",
        type=Path,
        help="Path to write the HTML viewer file",
    )
    parser.add_argument(
        "--title",
        default="Stylistic Drift Analyzer",
        help="Page title (default: 'Stylistic Drift Analyzer')",
    )

    ns = parser.parse_args()

    # Deferred import keeps `--help` fast.
    from pystylometry.viz.jsx import export_drift_viewer

    viewer_path = export_drift_viewer(ns.output, title=ns.title)
    url = f"file://{viewer_path.resolve()}"

    # Emit the summary banner line by line.
    for line in (
        "",
        " PYSTYLOMETRY — Standalone Drift Viewer",
        " ═══════════════════════════════════════════════════════════════════════",
        "",
        f" Generated: {viewer_path}",
        f" Open in browser: {url}",
        "",
        " This viewer can be shared with anyone. Users can:",
        " • Drag-and-drop or upload .txt files",
        " • Configure analysis parameters",
        " • View interactive drift timeline",
        " • Click points to see chunk comparisons",
        "",
    ):
        print(line)
425
+
426
if __name__ == "__main__":
    # Allow running this module directly (the installed entry points call
    # drift_cli/viewer_cli via console_scripts instead).
    drift_cli()
@@ -0,0 +1,27 @@
1
+ # consistency
2
+
3
+ ![1 public function](https://img.shields.io/badge/functions-1-blue)
4
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
5
+
6
+ Intra-document style drift detection using sliding-window chi-squared analysis.
7
+
8
+ ## Catalogue
9
+
10
+ | File | Function | What It Does |
11
+ |------|----------|-------------|
12
+ | `drift.py` | `compute_kilgarriff_drift` | Detects stylistic drift, splice points, and AI-generation signatures |
13
+ | `_thresholds.py` | _(internal)_ | Classification thresholds for pattern detection |
14
+
15
+ ## Detected Patterns
16
+
17
+ | Pattern | Meaning |
18
+ |---------|---------|
19
+ | `consistent` | Natural human variation throughout |
20
+ | `gradual_drift` | Style shifts progressively over the document |
21
+ | `sudden_spike` | Abrupt discontinuity (possible splice or paste) |
22
+ | `suspiciously_uniform` | Unnaturally low variation (possible AI generation) |
23
+
24
+ ## See Also
25
+
26
+ - [`authorship/kilgarriff.py`](../authorship/) — the underlying chi-squared method (between-text comparison)
27
+ - [`viz/`](../viz/) for timeline and report visualizations of drift results
@@ -0,0 +1,57 @@
1
"""Consistency analysis module for pystylometry.

Tools for measuring internal stylistic consistency within a *single*
document. Where the `authorship` module compares different texts against
each other, this module looks for patterns inside one text:

- Stylistic drift over the course of a document
- Sudden discontinuities suggesting pasted content or a second author
- Suspiciously uniform style (a potential AI-generation signature)
- Natural variation patterns typical of human writing

Related GitHub Issues:
    #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
        https://github.com/craigtrim/pystylometry/issues/36
    #27 - Native chunked analysis with Distribution dataclass
        https://github.com/craigtrim/pystylometry/issues/27

Marketing Names:
    "Style Drift Detector", "Consistency Fingerprint",
    "Authorship Continuity Score"

Available Functions:
    compute_kilgarriff_drift: Detect stylistic drift using chi-squared method

Example Usage:
    >>> from pystylometry.consistency import compute_kilgarriff_drift
    >>>
    >>> result = compute_kilgarriff_drift(document_text)
    >>> print(f"Pattern: {result.pattern}")  # e.g., "consistent", "sudden_spike"
    >>> print(f"Confidence: {result.pattern_confidence:.2f}")
    >>>
    >>> # Flag potential AI generation
    >>> if result.pattern == "suspiciously_uniform":
    ...     print("Warning: Text shows unusually uniform style")
    >>>
    >>> # Locate the largest style shift
    >>> if result.pattern == "sudden_spike":
    ...     print(f"Major discontinuity at window boundary {result.max_location}")

References:
    Kilgarriff, Adam. "Comparing Corpora." International Journal of Corpus
    Linguistics, vol. 6, no. 1, 2001, pp. 97-133.

    Eder, Maciej. "Does Size Matter? Authorship Attribution, Small Samples,
    Big Problem." Digital Scholarship in the Humanities, vol. 30, no. 2,
    2015, pp. 167-182.
"""

from .drift import compute_kilgarriff_drift

__all__ = ["compute_kilgarriff_drift"]
@@ -0,0 +1,162 @@
1
"""Threshold constants for consistency pattern classification.

Calibration constants consumed by `compute_kilgarriff_drift()` when it maps
chi-squared statistics onto named drift patterns. The values are initial
estimates grounded in theory and limited testing; they should be refined on
diverse corpora (human texts of varying length/genre, AI-generated texts
from different models, mixed human/AI documents, multi-author documents).

Related GitHub Issues:
    #36 - Kilgarriff Chi-Squared drift detection for intra-document analysis
        https://github.com/craigtrim/pystylometry/issues/36

Why module-level constants:
    1. Transparency: users can inspect the values in use
    2. Customization: advanced users can override them
    3. Research: easy adjustment for calibration studies

Classification decision tree (applied in order):
    1. suspiciously_uniform — near-zero variance in chi-squared across
       chunks (AI-generated text often shows eerily stable statistics)
    2. sudden_spike        — one comparison is an outlier far above the
       mean (pasted content, second author, major edit)
    3. gradual_drift       — a significant upward slope over time (author
       fatigue, topic evolution)
    4. consistent          — none of the above; natural human variation

References:
    Eder, Maciej, et al. "Stylometry with R: A Package for Computational Text
    Analysis." The R Journal, vol. 8, no. 1, 2016, pp. 107-121.

    Juola, Patrick. "Authorship Attribution." Foundations and Trends in
    Information Retrieval, vol. 1, no. 3, 2006, pp. 233-334.
"""

from __future__ import annotations

# ---------------------------------------------------------------------------
# Window-count requirements
# ---------------------------------------------------------------------------

# 3 windows yield the two sequential pairs (1,2) and (2,3) — the bare minimum
# for any variance calculation.
MIN_WINDOWS = 3

# 5 windows yield four sequential pairs, enough for mean, std, and a basic
# trend estimate — the recommended floor for reliable classification.
RECOMMENDED_WINDOWS = 5


# ---------------------------------------------------------------------------
# "suspiciously_uniform" detection
# ---------------------------------------------------------------------------
# Human writing naturally fluctuates; AI-generated text often maintains
# near-identical statistics across chunks.

# Coefficient of variation (std / mean) below which variance is suspicious.
UNIFORM_CV_THRESHOLD = 0.15

# A low absolute mean is also required: a high mean with low variance is
# merely "consistent" with a high baseline, not uniform.
UNIFORM_MEAN_THRESHOLD = 50.0


# ---------------------------------------------------------------------------
# "sudden_spike" detection
# ---------------------------------------------------------------------------
# Pasted content, different authors, or major edits cause discontinuities.

# An outlier exists when max_chi_squared > SPIKE_RATIO * mean_chi_squared.
SPIKE_RATIO = 2.5

# Floor on spike magnitude, to avoid false positives on low-baseline texts.
SPIKE_MIN_ABSOLUTE = 100.0


# ---------------------------------------------------------------------------
# "gradual_drift" detection
# ---------------------------------------------------------------------------
# Rising chi-squared over time suggests evolving style or author fatigue.

# Minimum slope (chi-squared units per chunk pair); positive means the
# windows diverge over time, negative means converging style (less common).
TREND_SLOPE_THRESHOLD = 5.0

# The fitted trend must explain at least this fraction of the variance.
TREND_R_SQUARED_THRESHOLD = 0.3


# If none of the above fire, the text is classified as "consistent" —
# natural human writing with normal variation.


# ---------------------------------------------------------------------------
# Confidence scaling
# ---------------------------------------------------------------------------

# Full confidence requires at least this many windows; below it, confidence
# is scaled down proportionally.
CONFIDENCE_MIN_WINDOWS = 5

# Ceiling on confidence when only marginal data is available.
MARGINAL_DATA_MAX_CONFIDENCE = 0.6


def get_all_thresholds() -> dict[str, float]:
    """Return every threshold as a name -> value mapping.

    Useful for logging/debugging (record the thresholds in effect),
    transparency (embed them in result metadata), and research (compare
    alternative calibrations).

    Returns:
        Dict mapping threshold names to their values.

    Example:
        >>> get_all_thresholds()["spike_ratio"]
        2.5
    """
    return {
        "min_windows": MIN_WINDOWS,
        "recommended_windows": RECOMMENDED_WINDOWS,
        "uniform_cv_threshold": UNIFORM_CV_THRESHOLD,
        "uniform_mean_threshold": UNIFORM_MEAN_THRESHOLD,
        "spike_ratio": SPIKE_RATIO,
        "spike_min_absolute": SPIKE_MIN_ABSOLUTE,
        "trend_slope_threshold": TREND_SLOPE_THRESHOLD,
        "trend_r_squared_threshold": TREND_R_SQUARED_THRESHOLD,
        "confidence_min_windows": CONFIDENCE_MIN_WINDOWS,
        "marginal_data_max_confidence": MARGINAL_DATA_MAX_CONFIDENCE,
    }