pystylometry 1.1.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pystylometry/README.md ADDED
@@ -0,0 +1,42 @@
+ # pystylometry
+
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)
+ ![License: MIT](https://img.shields.io/badge/license-MIT-green)
+
+ Core package for stylometric analysis and authorship attribution.
+
+ ## Module Map
+
+ | Module | Purpose | Key Functions |
+ |--------|---------|---------------|
+ | [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+ | [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+ | [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+ | [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+ | [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+ | [`character/`](character/) | Character-level features | `compute_character_metrics` |
+ | [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+ | [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+ | [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+ | [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+ | [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+ ## Shared Internals
+
+ | File | Purpose |
+ |------|---------|
+ | `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+ | `_normalize.py` | Text normalization for readability and stylometry pipelines |
+ | `_utils.py` | Shared tokenization and helper functions |
+ | `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+ | `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+ ## Installation Extras
+
+ ```
+ pip install pystylometry                # Core (lexical only)
+ pip install pystylometry[readability]   # + readability
+ pip install pystylometry[syntactic]     # + syntactic (requires spaCy)
+ pip install pystylometry[authorship]    # + authorship attribution
+ pip install pystylometry[all]           # Everything
+ ```
pystylometry/__init__.py CHANGED
@@ -63,18 +63,28 @@ try:
  except ImportError:
      _SYNTACTIC_AVAILABLE = False

- # Authorship, ngrams, dialect, and consistency use only stdlib (no external dependencies)
+ # Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+ try:
+     from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+     _PROSODY_AVAILABLE = True
+ except ImportError:
+     _PROSODY_AVAILABLE = False
+
+ # Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
  from . import (
      authorship,  # noqa: F401
      consistency,  # noqa: F401 - Style drift detection (Issue #36)
      dialect,  # noqa: F401
      ngrams,  # noqa: F401
+     stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
  )

  _AUTHORSHIP_AVAILABLE = True
  _NGRAMS_AVAILABLE = True
  _DIALECT_AVAILABLE = True
  _CONSISTENCY_AVAILABLE = True
+ _STYLISTIC_AVAILABLE = True


  def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
          "ngrams": _NGRAMS_AVAILABLE,
          "dialect": _DIALECT_AVAILABLE,
          "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+         "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+         "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
      }


@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
      __all__.append("dialect")
  if _CONSISTENCY_AVAILABLE:
      __all__.append("consistency")
+ if _STYLISTIC_AVAILABLE:
+     __all__.append("stylistic")
+ if _PROSODY_AVAILABLE:
+     __all__.append("prosody")
pystylometry/_types.py CHANGED
@@ -370,6 +370,158 @@ class TTRResult:
      metadata: dict[str, Any]


+ # ===== Repetition Detection Results =====
+ # Related to GitHub Issue #28: Verbal tics detection for slop analysis
+ # https://github.com/craigtrim/pystylometry/issues/28
+
+
+ @dataclass
+ class RepetitiveWord:
+     """A single word flagged as abnormally repetitive.
+
+     The repetition_score is the ratio of observed count to expected count
+     based on the word's frequency in the British National Corpus (BNC).
+     Higher scores indicate stronger overrepresentation.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Attributes:
+         word: The flagged word (lowercased).
+         count: Observed count in the text.
+         expected_count: Expected count based on BNC relative frequency × text length.
+             0.0 if word not found in BNC.
+         repetition_score: count / expected_count. float('inf') if expected_count is 0.
+         bnc_bucket: BNC frequency bucket (1-100, 1=most frequent). None if not in BNC.
+         chunk_counts: Per-chunk occurrence counts (for distribution analysis).
+         distribution_entropy: Shannon entropy of the word's chunk distribution.
+             Low entropy = suspiciously even spread (model tic).
+             High entropy = clustered usage (human writing about a specific scene).
+         distribution_variance: Variance of per-chunk counts.
+     """
+
+     word: str
+     count: int
+     expected_count: float
+     repetition_score: float
+     bnc_bucket: int | None
+     chunk_counts: list[int]
+     distribution_entropy: float
+     distribution_variance: float
+
+
+ @dataclass
+ class RepetitiveUnigramsResult:
+     """Result from repetitive unigram detection.
+
+     Identifies content words that appear far more frequently than expected
+     based on their frequency in the British National Corpus (BNC, ~100M tokens).
+     This is a key indicator of AI-generated "slop" where models exhibit verbal
+     tics — repeating certain words with suspicious regularity.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     The slop_score provides a single aggregate metric:
+         slop_score = flagged_words_per_10k × mean_repetition_score
+
+     Where:
+         - flagged_words_per_10k = count of flagged words / (total content words / 10000)
+         - mean_repetition_score = mean repetition_score across all flagged words
+
+     Higher slop_score = more likely AI-generated verbal tics.
+
+     References:
+         British National Corpus Consortium. (2007). The British National Corpus,
+             version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+
+     Example:
+         >>> result = compute_repetitive_unigrams(text)
+         >>> for w in result.repetitive_words[:5]:
+         ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
+         ...           f"score {w.repetition_score:.1f})")
+         shimmered: 23x (expected 0.1, score 266.2)
+         >>> result.slop_score
+         42.7
+     """
+
+     repetitive_words: list[RepetitiveWord]  # Sorted by repetition_score descending
+     total_content_words: int
+     flagged_count: int  # Number of words exceeding threshold
+     flagged_words_per_10k: float  # flagged_count / (total_content_words / 10000)
+     mean_repetition_score: float  # Mean score across flagged words
+     slop_score: float  # Aggregate: flagged_words_per_10k × mean_repetition_score
+     total_content_words_dist: Distribution
+     chunk_size: int
+     chunk_count: int
+     metadata: dict[str, Any]
+
+
+ @dataclass
+ class RepetitiveNgram:
+     """A single n-gram flagged as abnormally repetitive.
+
+     Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim
+     in natural writing. N-grams that repeat beyond a length-scaled threshold
+     are flagged.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Attributes:
+         ngram: The flagged n-gram as a tuple of words.
+         count: Observed count in the text.
+         frequency_per_10k: Occurrences per 10,000 n-grams.
+         chunk_counts: Per-chunk occurrence counts.
+         distribution_entropy: Shannon entropy of the n-gram's chunk distribution.
+         distribution_variance: Variance of per-chunk counts.
+     """
+
+     ngram: tuple[str, ...]
+     count: int
+     frequency_per_10k: float
+     chunk_counts: list[int]
+     distribution_entropy: float
+     distribution_variance: float
+
+
+ @dataclass
+ class RepetitiveNgramsResult:
+     """Result from repetitive n-gram detection.
+
+     Detects bigrams, trigrams, or higher-order n-grams that repeat more than
+     expected within the text. No external corpus is required — content n-grams
+     should not repeat verbatim often in natural writing.
+
+     N-grams composed entirely of function words (e.g., "of the", "in a") are
+     excluded since their repetition is expected.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Example:
+         >>> result = compute_repetitive_ngrams(text, n=2)
+         >>> for ng in result.repetitive_ngrams[:5]:
+         ...     print(f"{' '.join(ng.ngram)}: {ng.count}x "
+         ...           f"({ng.frequency_per_10k:.1f} per 10k)")
+         uncomfortable truth: 8x (1.6 per 10k)
+     """
+
+     repetitive_ngrams: list[RepetitiveNgram]  # Sorted by count descending
+     n: int | tuple[int, ...]  # N-gram order(s) analyzed
+     total_ngrams: int
+     flagged_count: int
+     flagged_per_10k: float  # flagged_count / (total_ngrams / 10000)
+     total_ngrams_dist: Distribution
+     chunk_size: int
+     chunk_count: int
+     metadata: dict[str, Any]
+
+
  # ===== Readability Results =====


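The slop_score aggregation documented in `RepetitiveUnigramsResult` is plain arithmetic. A worked example with hypothetical counts (only the formula comes from the docstring above; the numbers are made up):

```python
# Worked example of the documented slop_score formula.
# The counts below are hypothetical, not package output.
flagged_count = 12            # words exceeding the repetition threshold
total_content_words = 8_000   # content words in the analyzed text
mean_repetition_score = 9.5   # mean observed/expected ratio over flagged words

flagged_words_per_10k = flagged_count / (total_content_words / 10_000)  # 15.0
slop_score = flagged_words_per_10k * mean_repetition_score              # 142.5
```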
@@ -1517,6 +1669,7 @@ class VocabularyOverlapResult:
          - Dice coefficient (2 * intersection / sum of sizes)
          - Overlap coefficient (intersection / min(size1, size2))
          - Cosine similarity (using word frequency vectors)
+         - KL divergence (asymmetric distributional difference)
          - Shared vocabulary size and ratio
          - Unique words in each text
          - Most distinctive words for each text
@@ -1526,6 +1679,10 @@ class VocabularyOverlapResult:
              New Phytologist, 11(2), 37-50.
          Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
              Retrieval. McGraw-Hill.
+         Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+             Annals of Mathematical Statistics, 22(1), 79-86.
+         Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+             MIT Press.

      Example:
          >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1696,7 @@ class VocabularyOverlapResult:
      dice_coefficient: float  # 2 * intersection / (size1 + size2)
      overlap_coefficient: float  # Intersection / min(size1, size2)
      cosine_similarity: float  # Cosine of frequency vectors
+     kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)

      # Vocabulary sizes
      text1_vocab_size: int  # Unique words in text 1
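The new `kl_divergence` field is the standard Kullback-Leibler divergence over the two texts' word-frequency distributions, KL(P || Q) = sum over w of p(w) * log(p(w) / q(w)). The diff does not show how pystylometry tokenizes or handles zero counts; the sketch below uses add-one smoothing over the union vocabulary and is illustrative only, not the package's implementation:

```python
import math
from collections import Counter


def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
    """KL(P || Q) over word-frequency distributions, with add-one smoothing.

    Illustrative only: pystylometry's own smoothing and tokenization
    choices are not visible in this diff.
    """
    c1, c2 = Counter(tokens1), Counter(tokens2)
    vocab = set(c1) | set(c2)
    n1 = sum(c1.values()) + len(vocab)  # add-one smoothed totals
    n2 = sum(c2.values()) + len(vocab)
    kl = 0.0
    for w in vocab:
        p = (c1[w] + 1) / n1
        q = (c2[w] + 1) / n2
        kl += p * math.log(p / q)
    return kl
```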
@@ -1897,6 +2055,54 @@ class JohnsBurrowsResult:
      metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.


+ @dataclass
+ class CompressionResult:
+     """Result from compression-based authorship attribution.
+
+     Compression-based methods use the Normalized Compression Distance (NCD) to
+     measure similarity between texts. The intuition is that if two texts are
+     similar, compressing them together will yield better compression than
+     compressing separately. This approach is language-independent and captures
+     deep statistical regularities.
+
+     Related GitHub Issue:
+         #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+
+     Formula:
+         NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+     where C(x) is the compressed size of x, and C(xy) is the compressed
+     size of x and y concatenated.
+
+     Interpretation:
+         - NCD ≈ 0: Texts are very similar
+         - NCD ≈ 1: Texts are very different
+         - Typical same-author pairs: 0.3-0.6
+         - Typical different-author pairs: 0.6-0.9
+
+     References:
+         Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+             IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+         Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+             zipping. Physical Review Letters, 88(4), 048702.
+
+     Example:
+         >>> result = compute_compression_distance(text1, text2)
+         >>> print(f"NCD: {result.ncd:.3f}")
+         >>> if result.ncd < 0.5:
+         ...     print("Texts likely by same author")
+     """
+
+     ncd: float  # Normalized Compression Distance [0, 1+]
+     compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+     text1_compressed_size: int  # Compressed size of text1 alone
+     text2_compressed_size: int  # Compressed size of text2 alone
+     combined_compressed_size: int  # Compressed size of concatenated texts
+     metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
  # ===== Rhythm and Prosody Results =====
  # Related to GitHub Issue #25: Rhythm and Prosody Metrics
  # https://github.com/craigtrim/pystylometry/issues/25
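The NCD formula documented in `CompressionResult` can be reproduced with any stdlib compressor. A minimal sketch with `zlib` (an independent illustration; the package's `compute_compression_distance` may differ in compressor choice, preprocessing, and concatenation details):

```python
import zlib


def ncd(text1: str, text2: str, level: int = 9) -> float:
    """Normalized Compression Distance, per the formula in CompressionResult.

    Illustrative re-derivation with zlib; not pystylometry's source.
    """
    def csize(s: str) -> int:
        # Compressed size in bytes at the given compression level.
        return len(zlib.compress(s.encode("utf-8"), level))

    cx, cy = csize(text1), csize(text2)
    cxy = csize(text1 + text2)
    return (cxy - min(cx, cy)) / max(cx, cy)


# NCD near 0 suggests very similar texts; real compressors can push the
# value slightly above 1, which is why the field is documented as [0, 1+].
```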
pystylometry/authorship/README.md ADDED
@@ -0,0 +1,21 @@
+ # authorship
+
+ ![7 public functions](https://img.shields.io/badge/functions-7-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Authorship attribution methods for comparing texts and determining likely authorship.
+
+ ## Catalogue
+
+ | File | Functions | Method |
+ |------|-----------|--------|
+ | `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+ | `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+ | `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+ | `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+ | `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+ ## See Also
+
+ - [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+ - [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
pystylometry/authorship/__init__.py CHANGED
@@ -2,8 +2,8 @@

  This module provides methods for authorship attribution - comparing texts to
  determine whether they were written by the same author. Available methods
- include classic approaches (Burrows' Delta, Zeta) and statistical methods
- (Kilgarriff's chi-squared).
+ include classic approaches (Burrows' Delta, Zeta), statistical methods
+ (Kilgarriff's chi-squared), and information-theoretic methods (NCD).

  Related GitHub Issues:
      #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
      compute_cosine_delta: Angular distance variant of Delta
      compute_zeta: Zeta method for marker word detection
      compute_kilgarriff: Chi-squared method for corpus comparison
-     compute_minmax: Burrows' original min-max method (not yet implemented)
-     compute_johns_delta: Delta variations (not yet implemented)
+     compute_minmax: Burrows' original min-max distance method
+     compute_johns_delta: Delta variations (quadratic, weighted)
+     compute_compression_distance: Normalized Compression Distance (NCD)
  """

  from .additional_methods import compute_johns_delta, compute_minmax
  from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+ from .compression import compute_compression_distance
  from .kilgarriff import compute_kilgarriff
  from .zeta import compute_zeta

  __all__ = [
      "compute_burrows_delta",
+     "compute_compression_distance",
      "compute_cosine_delta",
-     "compute_zeta",
+     "compute_johns_delta",
      "compute_kilgarriff",
      "compute_minmax",
-     "compute_johns_delta",
+     "compute_zeta",
  ]