pystylometry 1.1.0__py3-none-any.whl → 1.3.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
pystylometry/README.md ADDED
@@ -0,0 +1,42 @@
+ # pystylometry
+
+ ![Python 3.10+](https://img.shields.io/badge/python-3.10%2B-blue)
+ ![License: MIT](https://img.shields.io/badge/license-MIT-green)
+
+ Core package for stylometric analysis and authorship attribution.
+
+ ## Module Map
+
+ | Module | Purpose | Key Functions |
+ |--------|---------|---------------|
+ | [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+ | [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+ | [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+ | [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+ | [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+ | [`character/`](character/) | Character-level features | `compute_character_metrics` |
+ | [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+ | [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+ | [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+ | [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+ | [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+ ## Shared Internals
+
+ | File | Purpose |
+ |------|---------|
+ | `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+ | `_normalize.py` | Text normalization for readability and stylometry pipelines |
+ | `_utils.py` | Shared tokenization and helper functions |
+ | `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+ | `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+ ## Installation Extras
+
+ ```
+ pip install pystylometry                # Core (lexical only)
+ pip install pystylometry[readability]   # + readability
+ pip install pystylometry[syntactic]     # + syntactic (requires spaCy)
+ pip install pystylometry[authorship]    # + authorship attribution
+ pip install pystylometry[all]           # Everything
+ ```
pystylometry/__init__.py CHANGED
@@ -63,18 +63,28 @@ try:
  except ImportError:
      _SYNTACTIC_AVAILABLE = False

- # Authorship, ngrams, dialect, and consistency use only stdlib (no external dependencies)
+ # Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+ try:
+     from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+     _PROSODY_AVAILABLE = True
+ except ImportError:
+     _PROSODY_AVAILABLE = False
+
+ # Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
  from . import (
      authorship,  # noqa: F401
      consistency,  # noqa: F401 - Style drift detection (Issue #36)
      dialect,  # noqa: F401
      ngrams,  # noqa: F401
+     stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
  )

  _AUTHORSHIP_AVAILABLE = True
  _NGRAMS_AVAILABLE = True
  _DIALECT_AVAILABLE = True
  _CONSISTENCY_AVAILABLE = True
+ _STYLISTIC_AVAILABLE = True


  def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
          "ngrams": _NGRAMS_AVAILABLE,
          "dialect": _DIALECT_AVAILABLE,
          "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+         "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+         "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
      }


@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
      __all__.append("dialect")
  if _CONSISTENCY_AVAILABLE:
      __all__.append("consistency")
+ if _STYLISTIC_AVAILABLE:
+     __all__.append("stylistic")
+ if _PROSODY_AVAILABLE:
+     __all__.append("prosody")
pystylometry/_types.py CHANGED
@@ -370,6 +370,158 @@ class TTRResult:
      metadata: dict[str, Any]


+ # ===== Repetition Detection Results =====
+ # Related to GitHub Issue #28: Verbal tics detection for slop analysis
+ # https://github.com/craigtrim/pystylometry/issues/28
+
+
+ @dataclass
+ class RepetitiveWord:
+     """A single word flagged as abnormally repetitive.
+
+     The repetition_score is the ratio of observed count to expected count
+     based on the word's frequency in the British National Corpus (BNC).
+     Higher scores indicate stronger overrepresentation.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Attributes:
+         word: The flagged word (lowercased).
+         count: Observed count in the text.
+         expected_count: Expected count based on BNC relative frequency × text length.
+             0.0 if word not found in BNC.
+         repetition_score: count / expected_count. float('inf') if expected_count is 0.
+         bnc_bucket: BNC frequency bucket (1-100, 1=most frequent). None if not in BNC.
+         chunk_counts: Per-chunk occurrence counts (for distribution analysis).
+         distribution_entropy: Shannon entropy of the word's chunk distribution.
+             Low entropy = suspiciously even spread (model tic).
+             High entropy = clustered usage (human writing about a specific scene).
+         distribution_variance: Variance of per-chunk counts.
+     """
+
+     word: str
+     count: int
+     expected_count: float
+     repetition_score: float
+     bnc_bucket: int | None
+     chunk_counts: list[int]
+     distribution_entropy: float
+     distribution_variance: float
+
+
+ @dataclass
+ class RepetitiveUnigramsResult:
+     """Result from repetitive unigram detection.
+
+     Identifies content words that appear far more frequently than expected
+     based on their frequency in the British National Corpus (BNC, ~100M tokens).
+     This is a key indicator of AI-generated "slop" where models exhibit verbal
+     tics — repeating certain words with suspicious regularity.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     The slop_score provides a single aggregate metric:
+         slop_score = flagged_words_per_10k × mean_repetition_score
+
+     Where:
+         - flagged_words_per_10k = count of flagged words / (total content words / 10000)
+         - mean_repetition_score = mean repetition_score across all flagged words
+
+     Higher slop_score = more likely AI-generated verbal tics.
+
+     References:
+         British National Corpus Consortium. (2007). The British National Corpus,
+             version 3 (BNC XML Edition). http://www.natcorp.ox.ac.uk/
+
+     Example:
+         >>> result = compute_repetitive_unigrams(text)
+         >>> for w in result.repetitive_words[:5]:
+         ...     print(f"{w.word}: {w.count}x (expected {w.expected_count:.1f}, "
+         ...           f"score {w.repetition_score:.1f})")
+         shimmered: 23x (expected 0.1, score 266.2)
+         >>> result.slop_score
+         42.7
+     """
+
+     repetitive_words: list[RepetitiveWord]  # Sorted by repetition_score descending
+     total_content_words: int
+     flagged_count: int  # Number of words exceeding threshold
+     flagged_words_per_10k: float  # flagged_count / (total_content_words / 10000)
+     mean_repetition_score: float  # Mean score across flagged words
+     slop_score: float  # Aggregate: flagged_words_per_10k × mean_repetition_score
+     total_content_words_dist: Distribution
+     chunk_size: int
+     chunk_count: int
+     metadata: dict[str, Any]
+
+
+ @dataclass
+ class RepetitiveNgram:
+     """A single n-gram flagged as abnormally repetitive.
+
+     Content n-grams (bigrams, trigrams, etc.) should rarely repeat verbatim
+     in natural writing. N-grams that repeat beyond a length-scaled threshold
+     are flagged.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Attributes:
+         ngram: The flagged n-gram as a tuple of words.
+         count: Observed count in the text.
+         frequency_per_10k: Occurrences per 10,000 n-grams.
+         chunk_counts: Per-chunk occurrence counts.
+         distribution_entropy: Shannon entropy of the n-gram's chunk distribution.
+         distribution_variance: Variance of per-chunk counts.
+     """
+
+     ngram: tuple[str, ...]
+     count: int
+     frequency_per_10k: float
+     chunk_counts: list[int]
+     distribution_entropy: float
+     distribution_variance: float
+
+
+ @dataclass
+ class RepetitiveNgramsResult:
+     """Result from repetitive n-gram detection.
+
+     Detects bigrams, trigrams, or higher-order n-grams that repeat more than
+     expected within the text. No external corpus is required — content n-grams
+     should not repeat verbatim often in natural writing.
+
+     N-grams composed entirely of function words (e.g., "of the", "in a") are
+     excluded since their repetition is expected.
+
+     Related GitHub Issue:
+         #28 - Verbal tics detection for slop analysis
+         https://github.com/craigtrim/pystylometry/issues/28
+
+     Example:
+         >>> result = compute_repetitive_ngrams(text, n=2)
+         >>> for ng in result.repetitive_ngrams[:5]:
+         ...     print(f"{' '.join(ng.ngram)}: {ng.count}x "
+         ...           f"({ng.frequency_per_10k:.1f} per 10k)")
+         uncomfortable truth: 8x (1.6 per 10k)
+     """
+
+     repetitive_ngrams: list[RepetitiveNgram]  # Sorted by count descending
+     n: int | tuple[int, ...]  # N-gram order(s) analyzed
+     total_ngrams: int
+     flagged_count: int
+     flagged_per_10k: float  # flagged_count / (total_ngrams / 10000)
+     total_ngrams_dist: Distribution
+     chunk_size: int
+     chunk_count: int
+     metadata: dict[str, Any]
+
+
  # ===== Readability Results =====


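The slop_score aggregation documented in `RepetitiveUnigramsResult` is plain arithmetic. A worked example with hypothetical counts (only the formula comes from the docstring above; the numbers are made up):

```python
# Worked example of the documented slop_score formula.
# The counts below are hypothetical, not package output.
flagged_count = 12            # words exceeding the repetition threshold
total_content_words = 8_000   # content words in the analyzed text
mean_repetition_score = 9.5   # mean observed/expected ratio over flagged words

flagged_words_per_10k = flagged_count / (total_content_words / 10_000)  # 15.0
slop_score = flagged_words_per_10k * mean_repetition_score              # 142.5
```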
@@ -1517,6 +1669,7 @@ class VocabularyOverlapResult:
          - Dice coefficient (2 * intersection / sum of sizes)
          - Overlap coefficient (intersection / min(size1, size2))
          - Cosine similarity (using word frequency vectors)
+         - KL divergence (asymmetric distributional difference)
          - Shared vocabulary size and ratio
          - Unique words in each text
          - Most distinctive words for each text
@@ -1526,6 +1679,10 @@ class VocabularyOverlapResult:
              New Phytologist, 11(2), 37-50.
          Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
              Retrieval. McGraw-Hill.
+         Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+             Annals of Mathematical Statistics, 22(1), 79-86.
+         Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+             MIT Press.

      Example:
          >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1696,7 @@ class VocabularyOverlapResult:
      dice_coefficient: float  # 2 * intersection / (size1 + size2)
      overlap_coefficient: float  # Intersection / min(size1, size2)
      cosine_similarity: float  # Cosine of frequency vectors
+     kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)

      # Vocabulary sizes
      text1_vocab_size: int  # Unique words in text 1
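The new `kl_divergence` field is the standard Kullback-Leibler divergence over the two texts' word-frequency distributions, KL(P || Q) = sum over w of p(w) * log(p(w) / q(w)). The diff does not show how pystylometry tokenizes or handles zero counts; the sketch below uses add-one smoothing over the union vocabulary and is illustrative only, not the package's implementation:

```python
import math
from collections import Counter


def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
    """KL(P || Q) over word-frequency distributions, with add-one smoothing.

    Illustrative only: pystylometry's own smoothing and tokenization
    choices are not visible in this diff.
    """
    c1, c2 = Counter(tokens1), Counter(tokens2)
    vocab = set(c1) | set(c2)
    n1 = sum(c1.values()) + len(vocab)  # add-one smoothed totals
    n2 = sum(c2.values()) + len(vocab)
    kl = 0.0
    for w in vocab:
        p = (c1[w] + 1) / n1
        q = (c2[w] + 1) / n2
        kl += p * math.log(p / q)
    return kl
```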
@@ -1897,6 +2055,54 @@ class JohnsBurrowsResult:
      metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.


+ @dataclass
+ class CompressionResult:
+     """Result from compression-based authorship attribution.
+
+     Compression-based methods use the Normalized Compression Distance (NCD) to
+     measure similarity between texts. The intuition is that if two texts are
+     similar, compressing them together will yield better compression than
+     compressing separately. This approach is language-independent and captures
+     deep statistical regularities.
+
+     Related GitHub Issue:
+         #24 - Additional Authorship Attribution Methods
+         https://github.com/craigtrim/pystylometry/issues/24
+
+     Formula:
+         NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+     where C(x) is the compressed size of x, and C(xy) is the compressed
+     size of x and y concatenated.
+
+     Interpretation:
+         - NCD ≈ 0: Texts are very similar
+         - NCD ≈ 1: Texts are very different
+         - Typical same-author pairs: 0.3-0.6
+         - Typical different-author pairs: 0.6-0.9
+
+     References:
+         Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+             IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+         Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+             zipping. Physical Review Letters, 88(4), 048702.
+
+     Example:
+         >>> result = compute_compression_distance(text1, text2)
+         >>> print(f"NCD: {result.ncd:.3f}")
+         >>> if result.ncd < 0.5:
+         ...     print("Texts likely by same author")
+     """
+
+     ncd: float  # Normalized Compression Distance [0, 1+]
+     compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+     text1_compressed_size: int  # Compressed size of text1 alone
+     text2_compressed_size: int  # Compressed size of text2 alone
+     combined_compressed_size: int  # Compressed size of concatenated texts
+     metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
  # ===== Rhythm and Prosody Results =====
  # Related to GitHub Issue #25: Rhythm and Prosody Metrics
  # https://github.com/craigtrim/pystylometry/issues/25
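The NCD formula documented in `CompressionResult` can be reproduced with any stdlib compressor. A minimal sketch with `zlib` (an independent illustration; the package's `compute_compression_distance` may differ in compressor choice, preprocessing, and concatenation details):

```python
import zlib


def ncd(text1: str, text2: str, level: int = 9) -> float:
    """Normalized Compression Distance, per the formula in CompressionResult.

    Illustrative re-derivation with zlib; not pystylometry's source.
    """
    def csize(s: str) -> int:
        # Compressed size in bytes at the given compression level.
        return len(zlib.compress(s.encode("utf-8"), level))

    cx, cy = csize(text1), csize(text2)
    cxy = csize(text1 + text2)
    return (cxy - min(cx, cy)) / max(cx, cy)


# NCD near 0 suggests very similar texts; real compressors can push the
# value slightly above 1, which is why the field is documented as [0, 1+].
```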
pystylometry/authorship/README.md ADDED
@@ -0,0 +1,21 @@
+ # authorship
+
+ ![7 public functions](https://img.shields.io/badge/functions-7-blue)
+ ![No external deps](https://img.shields.io/badge/deps-none-brightgreen)
+
+ Authorship attribution methods for comparing texts and determining likely authorship.
+
+ ## Catalogue
+
+ | File | Functions | Method |
+ |------|-----------|--------|
+ | `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+ | `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+ | `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+ | `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+ | `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+ ## See Also
+
+ - [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+ - [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
pystylometry/authorship/__init__.py CHANGED
@@ -2,8 +2,8 @@

  This module provides methods for authorship attribution - comparing texts to
  determine whether they were written by the same author. Available methods
- include classic approaches (Burrows' Delta, Zeta) and statistical methods
- (Kilgarriff's chi-squared).
+ include classic approaches (Burrows' Delta, Zeta), statistical methods
+ (Kilgarriff's chi-squared), and information-theoretic methods (NCD).

  Related GitHub Issues:
      #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
      compute_cosine_delta: Angular distance variant of Delta
      compute_zeta: Zeta method for marker word detection
      compute_kilgarriff: Chi-squared method for corpus comparison
-     compute_minmax: Burrows' original min-max method (not yet implemented)
-     compute_johns_delta: Delta variations (not yet implemented)
+     compute_minmax: Burrows' original min-max distance method
+     compute_johns_delta: Delta variations (quadratic, weighted)
+     compute_compression_distance: Normalized Compression Distance (NCD)
  """

  from .additional_methods import compute_johns_delta, compute_minmax
  from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+ from .compression import compute_compression_distance
  from .kilgarriff import compute_kilgarriff
  from .zeta import compute_zeta

  __all__ = [
      "compute_burrows_delta",
+     "compute_compression_distance",
      "compute_cosine_delta",
-     "compute_zeta",
+     "compute_johns_delta",
      "compute_kilgarriff",
      "compute_minmax",
-     "compute_johns_delta",
+     "compute_zeta",
  ]