pystylometry 1.1.0-py3-none-any.whl → 1.3.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +206 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/__init__.py +3 -0
- pystylometry/lexical/repetition.py +506 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.1.dist-info/LICENSE +21 -0
- pystylometry-1.3.1.dist-info/METADATA +79 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/RECORD +31 -16
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/WHEEL +1 -1
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.1.dist-info}/entry_points.txt +0 -0
pystylometry/ngrams/extended_ngrams.py

```diff
@@ -26,7 +26,174 @@ References:
         attribution. PACLING.
 """
 
+from __future__ import annotations
+
+import math
+from collections import Counter
+from typing import Sequence
+
 from .._types import ExtendedNgramResult
+from .._utils import advanced_tokenize
+
+
+def _generate_ngrams(sequence: Sequence[str], n: int) -> list[tuple[str, ...]]:
+    """
+    Generate n-grams from a sequence.
+
+    Slides a window of size n across the sequence and yields tuples
+    of n consecutive elements.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Args:
+        sequence: List of tokens (words or characters)
+        n: Size of the n-gram (e.g., 3 for trigrams)
+
+    Returns:
+        List of n-gram tuples
+
+    Example:
+        >>> _generate_ngrams(["the", "quick", "brown", "fox"], 2)
+        [('the', 'quick'), ('quick', 'brown'), ('brown', 'fox')]
+    """
+    if len(sequence) < n:
+        return []
+    return [tuple(sequence[i : i + n]) for i in range(len(sequence) - n + 1)]
+
+
+def _generate_skipgrams(sequence: Sequence[str], n: int, gap: int) -> list[tuple[str, ...]]:
+    """
+    Generate skipgrams (n-grams with gaps) from a sequence.
+
+    Skipgrams capture non-contiguous word patterns. For example, with n=2 and
+    gap=1, "the quick brown fox" yields ("the", "brown"), ("quick", "fox").
+    This captures syntactic frames independent of the specific intervening
+    words.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    References:
+        Guthrie, D., et al. (2006). A closer look at skip-gram modelling. LREC.
+
+    Args:
+        sequence: List of tokens
+        n: Number of words to include in each skipgram
+        gap: Number of words to skip between included words
+
+    Returns:
+        List of skipgram tuples
+
+    Example:
+        >>> _generate_skipgrams(["the", "quick", "brown", "fox"], 2, 1)
+        [('the', 'brown'), ('quick', 'fox')]
+        >>> _generate_skipgrams(["a", "b", "c", "d", "e"], 3, 1)
+        [('a', 'c', 'd'), ('b', 'd', 'e')]
+    """
+    if n < 2:
+        return [(s,) for s in sequence]
+
+    # Pattern: first word at position i, skip `gap` words, then take the
+    # remaining n-1 words contiguously.
+    #   n=2, gap=1: positions [i, i+2]      -> "word1 _ word3"
+    #   n=3, gap=1: positions [i, i+2, i+3] -> "word1 _ word3 word4"
+    # Total span = 1 + gap + (n - 1) = n + gap
+    total_span = n + gap
+    if len(sequence) < total_span:
+        return []
+
+    skipgrams = []
+    for i in range(len(sequence) - total_span + 1):
+        # First word
+        gram = [sequence[i]]
+        # Skip `gap` words, then take n-1 contiguous words
+        for j in range(n - 1):
+            gram.append(sequence[i + gap + 1 + j])
+        skipgrams.append(tuple(gram))
+
+    return skipgrams
+
+
+def _calculate_shannon_entropy(counter: Counter[tuple[str, ...]]) -> float:
+    """
+    Calculate the Shannon entropy of a frequency distribution.
+
+    Shannon entropy measures the uncertainty or information content of a
+    distribution. Higher entropy indicates a more uniform (diverse)
+    distribution; lower entropy indicates a few dominant n-grams.
+
+    Related GitHub Issue:
+        #19 - Extended N-gram Features
+        https://github.com/craigtrim/pystylometry/issues/19
+
+    Formula:
+        H = -Σ p(x) * log2(p(x))
+        where p(x) = count(x) / total
+
+    Args:
+        counter: Counter object with n-gram frequencies
+
+    Returns:
+        Shannon entropy in bits. Higher values indicate more diversity.
+
+    Example:
+        Four equally likely outcomes give the maximum entropy, log2(4):
+
+        >>> from collections import Counter
+        >>> _calculate_shannon_entropy(Counter({"a": 1, "b": 1, "c": 1, "d": 1}))
+        2.0
+    """
+    if not counter:
+        return 0.0
+
+    total = sum(counter.values())
+    if total == 0:
+        return 0.0
+
+    entropy = 0.0
+    for count in counter.values():
+        if count > 0:
+            p = count / total
+            entropy -= p * math.log2(p)
+
+    return entropy
+
+
+def _format_ngram(ngram: tuple[str, ...]) -> str:
+    """
+    Format an n-gram tuple as a readable string.
+
+    Args:
+        ngram: Tuple of tokens
+
+    Returns:
+        Space-joined string of the tokens (used for both word and
+        character n-grams)
+
+    Example:
+        >>> _format_ngram(("the", "quick", "fox"))
+        'the quick fox'
+    """
+    return " ".join(ngram)
+
+
+def _get_top_ngrams(counter: Counter[tuple[str, ...]], n: int) -> list[tuple[str, int]]:
+    """
+    Get the n most frequent n-grams, formatted as strings.
+
+    Args:
+        counter: Counter of n-gram tuples
+        n: Number of top items to return
+
+    Returns:
+        List of (ngram_string, count) tuples sorted by frequency
+    """
+    return [(_format_ngram(ngram), count) for ngram, count in counter.most_common(n)]
 
 
 def compute_extended_ngrams(
```
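The helpers added above are pure functions over token lists, so their behavior can be checked in isolation. A minimal sketch, assuming the installed package exposes the module at the path shown in the file list (the underscore-prefixed names are module-private, not a public API):

```python
from collections import Counter

# Private helpers from the hunk above; the import path follows the
# pystylometry/ngrams/extended_ngrams.py entry in the file list.
from pystylometry.ngrams.extended_ngrams import (
    _calculate_shannon_entropy,
    _generate_skipgrams,
)

tokens = "the cat sat on the mat and the dog sat on the rug".split()

# n=2, gap=1 pairs each word with the word two positions later, so the
# frame ("the", "sat") is counted twice even though the skipped word
# differs ("cat" in the first clause, "dog" in the second).
pairs = Counter(_generate_skipgrams(tokens, n=2, gap=1))
print(pairs.most_common(2))  # [(('the', 'sat'), 2), (('sat', 'the'), 2)]

# Entropy of the skipgram distribution, in bits. A uniform distribution
# over k patterns yields log2(k); dominant repeated frames pull it down,
# e.g. Counter({"ab": 3, "cd": 1}) gives about 0.811 bits versus 1.0
# for Counter({"ab": 1, "cd": 1}).
print(_calculate_shannon_entropy(pairs))
```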
pystylometry/ngrams/extended_ngrams.py

```diff
@@ -163,73 +330,151 @@ def compute_extended_ngrams(
         - Skipgrams can be very sparse (many unique patterns)
         - Entropy values are higher for more diverse n-gram distributions
     """
-    #
-    #
-    #
-
-    #
-
-
-    #
-
-
-    #
-    #
-    #
-
-    #
-
-
-
-    #
-
-
-
-    #
-    #
-    #
-
-    #
-
-
-
-    #
-
-
-
-    #
-    #
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # =========================================================================
+    # TOKENIZATION
+    # =========================================================================
+
+    # Word tokenization: lowercase, strip punctuation for word n-grams
+    words = advanced_tokenize(text, lowercase=True, strip_punctuation=True)
+
+    # Character sequence: lowercase but preserve spaces (for character n-grams)
+    chars = list(text.lower())
+
+    # =========================================================================
+    # WORD N-GRAMS
+    # =========================================================================
+
+    # Generate word trigrams (3-grams)
+    word_trigrams = _generate_ngrams(words, 3)
+    word_trigram_counter: Counter[tuple[str, ...]] = Counter(word_trigrams)
+
+    # Generate word 4-grams
+    word_4grams = _generate_ngrams(words, 4)
+    word_4gram_counter: Counter[tuple[str, ...]] = Counter(word_4grams)
+
+    # =========================================================================
+    # SKIPGRAMS
+    # =========================================================================
+
+    # 2-skipgrams with gap of 1: (word1, word3), skipping word2
+    skipgrams_2_1 = _generate_skipgrams(words, 2, 1)
+    skipgram_2_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_2_1)
+
+    # 3-skipgrams with gap of 1: (word1, word3, word4), skipping word2
+    skipgrams_3_1 = _generate_skipgrams(words, 3, 1)
+    skipgram_3_1_counter: Counter[tuple[str, ...]] = Counter(skipgrams_3_1)
+
+    # =========================================================================
+    # POS N-GRAMS (optional, requires spaCy)
+    # =========================================================================
+
+    pos_trigram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_4gram_counter: Counter[tuple[str, ...]] = Counter()
+    pos_trigram_entropy = 0.0
+
+    if include_pos_ngrams:
+        try:
+            import spacy
+        except ImportError:
+            # spaCy not installed - leave the POS results empty
+            spacy = None
+
+        if spacy is not None:
+            # Load the spaCy model
+            try:
+                nlp = spacy.load(spacy_model)
+            except OSError:
+                # Model not installed - surface a helpful message
+                raise ImportError(
+                    f"spaCy model '{spacy_model}' not found. "
+                    f"Install with: python -m spacy download {spacy_model}"
+                ) from None
+
+            # Process the text and extract POS tags
+            doc = nlp(text)
+            pos_tags = [token.pos_ for token in doc if not token.is_space]
+
+            # Generate POS trigrams
+            pos_trigrams = _generate_ngrams(pos_tags, 3)
+            pos_trigram_counter = Counter(pos_trigrams)
+
+            # Generate POS 4-grams
+            pos_4grams = _generate_ngrams(pos_tags, 4)
+            pos_4gram_counter = Counter(pos_4grams)
+
+            pos_trigram_entropy = _calculate_shannon_entropy(pos_trigram_counter)
+
+    # =========================================================================
+    # CHARACTER N-GRAMS
+    # =========================================================================
+
+    # Character trigrams
+    char_trigrams = _generate_ngrams(chars, 3)
+    char_trigram_counter: Counter[tuple[str, ...]] = Counter(char_trigrams)
+
+    # Character 4-grams
+    char_4grams = _generate_ngrams(chars, 4)
+    char_4gram_counter: Counter[tuple[str, ...]] = Counter(char_4grams)
+
+    # =========================================================================
+    # ENTROPY CALCULATIONS
+    # =========================================================================
+
+    word_trigram_entropy = _calculate_shannon_entropy(word_trigram_counter)
+    word_4gram_entropy = _calculate_shannon_entropy(word_4gram_counter)
+    char_trigram_entropy = _calculate_shannon_entropy(char_trigram_counter)
+    char_4gram_entropy = _calculate_shannon_entropy(char_4gram_counter)
+
+    # =========================================================================
+    # BUILD RESULT
+    # =========================================================================
+
+    return ExtendedNgramResult(
+        # Word n-grams
+        top_word_trigrams=_get_top_ngrams(word_trigram_counter, top_n),
+        top_word_4grams=_get_top_ngrams(word_4gram_counter, top_n),
+        word_trigram_count=len(word_trigram_counter),
+        word_4gram_count=len(word_4gram_counter),
+        word_trigram_entropy=word_trigram_entropy,
+        word_4gram_entropy=word_4gram_entropy,
+        # Skipgrams
+        top_skipgrams_2_1=_get_top_ngrams(skipgram_2_1_counter, top_n),
+        top_skipgrams_3_1=_get_top_ngrams(skipgram_3_1_counter, top_n),
+        skipgram_2_1_count=len(skipgram_2_1_counter),
+        skipgram_3_1_count=len(skipgram_3_1_counter),
+        # POS n-grams
+        top_pos_trigrams=_get_top_ngrams(pos_trigram_counter, top_n),
+        top_pos_4grams=_get_top_ngrams(pos_4gram_counter, top_n),
+        pos_trigram_count=len(pos_trigram_counter),
+        pos_4gram_count=len(pos_4gram_counter),
+        pos_trigram_entropy=pos_trigram_entropy,
+        # Character n-grams
+        top_char_trigrams=_get_top_ngrams(char_trigram_counter, top_n),
+        top_char_4grams=_get_top_ngrams(char_4gram_counter, top_n),
+        char_trigram_entropy=char_trigram_entropy,
+        char_4gram_entropy=char_4gram_entropy,
+        # Metadata
+        metadata={
+            "parameters": {
+                "top_n": top_n,
+                "include_pos_ngrams": include_pos_ngrams,
+                "spacy_model": spacy_model if include_pos_ngrams else None,
+            },
+            "token_count": len(words),
+            "character_count": len(chars),
+            "word_trigram_tokens": len(word_trigrams),
+            "word_4gram_tokens": len(word_4grams),
+            "char_trigram_tokens": len(char_trigrams),
+            "char_4gram_tokens": len(char_4grams),
+            "full_distributions": {
+                "word_trigrams": dict(word_trigram_counter.most_common(100)),
+                "word_4grams": dict(word_4gram_counter.most_common(100)),
+                "skipgrams_2_1": dict(skipgram_2_1_counter.most_common(100)),
+                "skipgrams_3_1": dict(skipgram_3_1_counter.most_common(100)),
+                "pos_trigrams": dict(pos_trigram_counter.most_common(100)),
+                "pos_4grams": dict(pos_4gram_counter.most_common(100)),
+                "char_trigrams": dict(char_trigram_counter.most_common(100)),
+                "char_4grams": dict(char_4gram_counter.most_common(100)),
+            },
+        },
     )
```
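To tie the hunk together, a hedged usage sketch. The parameters `top_n`, `include_pos_ngrams`, and `spacy_model` are taken from the metadata block above; the full signature of `compute_extended_ngrams` (and any `ExtendedNgramResult` fields beyond those passed to the constructor) is not shown in this diff, so the text-first calling convention is an assumption:

```python
# Module path per the file list; assumes the text is the first argument.
from pystylometry.ngrams.extended_ngrams import compute_extended_ngrams

text = (
    "The quick brown fox jumps over the lazy dog. "
    "The quick brown fox naps beside the lazy dog."
)

# POS counters stay empty unless include_pos_ngrams=True and spaCy
# (plus the named model) is installed.
result = compute_extended_ngrams(text, top_n=10, include_pos_ngrams=False)

# The repeated sentence opening surfaces as a high-count word trigram.
print(result.top_word_trigrams[:3])  # e.g. [('the quick brown', 2), ...]
print(result.word_trigram_entropy)   # bits; lower means more repetition
print(result.metadata["token_count"])
```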
pystylometry/prosody/README.md

```diff
@@ -0,0 +1,17 @@
+# prosody
+
+[image]
+[image]
+
+Rhythm and stress pattern analysis for written text.
+
+## Catalogue
+
+| File | Function | What It Measures |
+|------|----------|-----------------|
+| `rhythm_prosody.py` | `compute_rhythm_prosody` | Syllable stress patterns, rhythm regularity, prose rhythm metrics |
+
+## See Also
+
+- [`readability/syllables.py`](../readability/) for the syllable counting engine
+- [`syntactic/`](../syntactic/) for sentence structure features that interact with prosodic rhythm
```