pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
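Most of the rewritten metric modules in this release share one pattern: the input text is split into fixed-size word chunks, the metric is computed per chunk, and the per-chunk values are summarised in a `Distribution` dataclass returned alongside the familiar scalar fields (see the `yule.py` and `entropy.py` diffs below). The `Distribution` type and the `chunk_text`/`make_distribution` helpers live in `pystylometry/_types.py`, which this view does not expand; the sketch below only illustrates the shape implied by the call sites, it is not the package's actual implementation.

```python
# Hypothetical sketch of the chunked-analysis result shape used throughout the diffs
# below. The real Distribution / make_distribution live in pystylometry/_types.py
# (not expanded in this view); field names are inferred from the call sites only.
import statistics
from dataclasses import dataclass


@dataclass
class Distribution:
    values: list[float]
    mean: float
    median: float
    std: float
    range: float
    iqr: float


def make_distribution(values: list[float]) -> Distribution:
    """Summarise per-chunk metric values (assumed behaviour, non-empty input)."""
    quartiles = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values),
        range=max(values) - min(values),
        iqr=quartiles[2] - quartiles[0],
    )


print(make_distribution([118.2, 125.9, 117.4]))
```

The field names (`values`, `mean`, `median`, `std`, `range`, `iqr`) are exactly the ones the new `yule.py` and `entropy.py` code constructs when a text yields no valid chunks.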
pystylometry/lexical/yule.py
CHANGED
```diff
@@ -1,15 +1,71 @@
-"""Yule's K and I statistics for vocabulary richness.
+"""Yule's K and I statistics for vocabulary richness.
 
+This module implements Yule's K and I metrics with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+import math
 from collections import Counter
 
-from .._types import YuleResult
+from .._types import Distribution, YuleResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def
+def _compute_yule_single(text: str) -> tuple[float, float, dict]:
+    """Compute Yule's K and I for a single chunk of text.
+
+    Returns:
+        Tuple of (yule_k, yule_i, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    tokens = tokenize(text.lower())
+    N = len(tokens)  # noqa: N806
+
+    if N == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"token_count": 0, "vocabulary_size": 0},
+        )
+
+    # Count frequency of each token
+    freq_counter = Counter(tokens)
+    V = len(freq_counter)  # noqa: N806
+
+    # Count how many words occur with each frequency
+    freq_of_freqs = Counter(freq_counter.values())
+
+    # Calculate Σm²×Vm
+    sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())
+
+    # Yule's K: 10⁴ × (Σm²×Vm - N) / N²
+    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
+
+    # Yule's I: V² / (Σm²×Vm - N)
+    denominator = sum_m2_vm - N
+    if denominator == 0:
+        yule_i = float("nan")
+    else:
+        yule_i = (V * V) / denominator
+
+    return (
+        yule_k,
+        yule_i,
+        {"token_count": N, "vocabulary_size": V},
+    )
+
+
+def compute_yule(text: str, chunk_size: int = 1000) -> YuleResult:
     """
     Compute Yule's K and I metrics for vocabulary richness.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Yule's K measures vocabulary repetitiveness (higher = more repetitive).
     Yule's I is the inverse measure (higher = more diverse).
 
@@ -23,71 +79,101 @@ def compute_yule(text: str) -> YuleResult:
         - Vm = number of types occurring m times
         - m = frequency count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Yule, G. U. (1944). The Statistical Study of Literary Vocabulary.
         Cambridge University Press.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        YuleResult with
-
-        Note: For empty input or when Σm²×Vm = N (perfectly uniform vocabulary),
-        metrics will be float('nan') to indicate undefined values.
+        YuleResult with yule_k, yule_i, distributions, and metadata
 
     Example:
-        >>> result = compute_yule("
-        >>>
-
-
-
-        >>> import math
-        >>> result_empty = compute_yule("")
-        >>> math.isnan(result_empty.yule_k)
-        True
+        >>> result = compute_yule("Long text here...", chunk_size=1000)
+        >>> result.yule_k  # Mean across chunks
+        120.5
+        >>> result.yule_k_dist.std  # Variance reveals fingerprint
+        15.2
     """
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    yule_k_values = []
+    yule_i_values = []
+    total_tokens = 0
+    total_vocab = 0
+
+    for chunk in chunks:
+        k, i, meta = _compute_yule_single(chunk)
+        if not math.isnan(k):
+            yule_k_values.append(k)
+        if not math.isnan(i):
+            yule_i_values.append(i)
+        total_tokens += meta.get("token_count", 0)
+        total_vocab += meta.get("vocabulary_size", 0)
+
+    # Handle empty or all-invalid chunks
+    if not yule_k_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return YuleResult(
             yule_k=float("nan"),
             yule_i=float("nan"),
-
+            yule_k_dist=empty_dist,
+            yule_i_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "token_count": 0,
+                "vocabulary_size": 0,
+                # New prefixed keys for consistency
+                "total_token_count": 0,
+                "total_vocabulary_size": 0,
+            },
         )
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-    # K measures vocabulary repetitiveness (higher K = more repetitive)
-    yule_k = 10_000 * (sum_m2_vm - N) / (N * N)
-
-    # Yule's I: V² / (Σm²×Vm - N)
-    # I is the inverse measure (higher I = more diverse)
-    # If Σm²×Vm = N (perfectly uniform vocabulary), denominator is 0, return NaN
-    denominator = sum_m2_vm - N
-    if denominator == 0:
-        yule_i = float("nan")
-    else:
-        yule_i = (V * V) / denominator
+    # Build distributions
+    yule_k_dist = make_distribution(yule_k_values)
+    yule_i_dist = (
+        make_distribution(yule_i_values)
+        if yule_i_values
+        else Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
+    )
 
     return YuleResult(
-        yule_k=
-        yule_i=
+        yule_k=yule_k_dist.mean,
+        yule_i=yule_i_dist.mean,
+        yule_k_dist=yule_k_dist,
+        yule_i_dist=yule_i_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
+            # Backward-compatible keys
+            "token_count": total_tokens,
+            "vocabulary_size": total_vocab,
+            # New prefixed keys for consistency
+            "total_token_count": total_tokens,
+            "total_vocabulary_size": total_vocab,
        },
     )
```
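As a sanity check on the formulas carried into `_compute_yule_single` above, here is a small stand-alone recomputation (standard library only, no pystylometry imports) of K = 10⁴ × (Σm²Vm − N) / N² and I = V² / (Σm²Vm − N) on a toy eight-token text.

```python
# Stand-alone check of the Yule's K / I arithmetic used in _compute_yule_single above.
from collections import Counter

tokens = "the cat sat on the mat the cat".split()
N = len(tokens)                            # 8 tokens
freq = Counter(tokens)                     # {'the': 3, 'cat': 2, 'sat': 1, 'on': 1, 'mat': 1}
V = len(freq)                              # 5 types
freq_of_freqs = Counter(freq.values())     # {1: 3, 2: 1, 3: 1}

sum_m2_vm = sum(m * m * vm for m, vm in freq_of_freqs.items())  # 1*3 + 4*1 + 9*1 = 16
yule_k = 10_000 * (sum_m2_vm - N) / (N * N)                     # 10000 * 8 / 64 = 1250.0
yule_i = (V * V) / (sum_m2_vm - N)                              # 25 / 8 = 3.125

print(yule_k, yule_i)  # 1250.0 3.125
```

With the chunked API, `compute_yule` reports the mean of these per-chunk values in `yule_k`/`yule_i` and their spread in `yule_k_dist`/`yule_i_dist`.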
pystylometry/ngrams/README.md
ADDED
```diff
@@ -0,0 +1,18 @@
+# ngrams
+
+
+
+
+N-gram generation, entropy computation, and sequence analysis.
+
+## Catalogue
+
+| File | Functions | What It Measures |
+|------|-----------|-----------------|
+| `entropy.py` | `compute_ngram_entropy`, `compute_character_bigram_entropy`, `compute_word_bigram_entropy` | Shannon entropy at character and word n-gram levels |
+| `extended_ngrams.py` | `compute_extended_ngrams` | Word, character, and POS n-gram profiles with frequency distributions |
+
+## See Also
+
+- [`syntactic/`](../syntactic/) provides POS tags consumed by `compute_extended_ngrams(text, pos=True)`
+- [`character/`](../character/) for character-level features without n-gram structure
```
pystylometry/ngrams/entropy.py
CHANGED
```diff
@@ -1,16 +1,83 @@
-"""N-gram entropy and perplexity calculations.
+"""N-gram entropy and perplexity calculations.
+
+This module implements n-gram entropy computation with native chunked analysis
+for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 import math
 from collections import Counter
 
-from .._types import EntropyResult
+from .._types import Distribution, EntropyResult, chunk_text, make_distribution
 from .._utils import tokenize
 
 
-def
+def _compute_ngram_entropy_single(text: str, n: int, ngram_type: str) -> tuple[float, float, dict]:
+    """Compute n-gram entropy for a single chunk of text.
+
+    Returns:
+        Tuple of (entropy, perplexity, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    # Generate n-grams
+    if ngram_type == "character":
+        items = list(text)
+    else:  # word
+        items = tokenize(text)
+
+    if len(items) < n:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "item_count": len(items),
+                "unique_ngrams": 0,
+                "total_ngrams": 0,
+            },
+        )
+
+    # Create n-grams using sliding window
+    ngram_list = []
+    for i in range(len(items) - n + 1):
+        ngram = tuple(items[i : i + n])
+        ngram_list.append(ngram)
+
+    # Count n-gram frequencies
+    ngram_counts = Counter(ngram_list)
+    total_ngrams = len(ngram_list)
+
+    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
+    entropy = 0.0
+    for count in ngram_counts.values():
+        probability = count / total_ngrams
+        entropy -= probability * math.log2(probability)
+
+    # Calculate perplexity: 2^H(X)
+    perplexity = 2**entropy
+
+    return (
+        entropy,
+        perplexity,
+        {
+            "item_count": len(items),
+            "unique_ngrams": len(ngram_counts),
+            "total_ngrams": total_ngrams,
+        },
+    )
+
+
+def compute_ngram_entropy(
+    text: str, n: int = 2, ngram_type: str = "word", chunk_size: int = 1000
+) -> EntropyResult:
     """
     Compute n-gram entropy and perplexity for text.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Entropy measures the unpredictability of the next item in a sequence.
     Higher entropy = more unpredictable = more diverse/complex text.
 
@@ -20,6 +87,10 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
 
     Where p(x) is the probability of n-gram x occurring.
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Shannon, C. E. (1948). A mathematical theory of communication.
         Bell System Technical Journal, 27(3), 379-423.
@@ -31,100 +102,130 @@ def compute_ngram_entropy(text: str, n: int = 2, ngram_type: str = "word") -> En
         text: Input text to analyze
         n: N-gram size (2 for bigrams, 3 for trigrams, etc.)
         ngram_type: "word" or "character" (default: "word")
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with entropy, perplexity, and metadata
+        EntropyResult with entropy, perplexity, distributions, and metadata
 
     Example:
-        >>> result = compute_ngram_entropy("
-        >>>
-
+        >>> result = compute_ngram_entropy("Long text here...", n=2, chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
+        >>> result.entropy_dist.std  # Variance reveals fingerprint
+        0.3
     """
-    #
-
-
-
-
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    entropy_values = []
+    perplexity_values = []
+    total_items = 0
+    total_unique_ngrams = 0
+    total_ngrams = 0
+
+    for chunk in chunks:
+        ent, perp, meta = _compute_ngram_entropy_single(chunk, n, ngram_type)
+        if not math.isnan(ent):
+            entropy_values.append(ent)
+            perplexity_values.append(perp)
+        total_items += meta.get("item_count", 0)
+        total_unique_ngrams += meta.get("unique_ngrams", 0)
+        total_ngrams += meta.get("total_ngrams", 0)
+
+    # Handle empty or all-invalid chunks
+    if not entropy_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return EntropyResult(
-            entropy=
-            perplexity=
+            entropy=float("nan"),
+            perplexity=float("nan"),
             ngram_type=f"{ngram_type}_{n}gram",
+            entropy_dist=empty_dist,
+            perplexity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
            metadata={
                 "n": n,
                 "ngram_type": ngram_type,
-                "
+                "total_item_count": total_items,
                 "warning": "Text too short for n-gram analysis",
             },
         )
 
-    #
-
-
-        ngram = tuple(items[i : i + n])
-        ngram_list.append(ngram)
-
-    # Count n-gram frequencies
-    ngram_counts = Counter(ngram_list)
-    total_ngrams = len(ngram_list)
-
-    # Calculate entropy: H(X) = -Σ p(x) × log₂(p(x))
-    entropy = 0.0
-    for count in ngram_counts.values():
-        probability = count / total_ngrams
-        entropy -= probability * math.log2(probability)
-
-    # Calculate perplexity: 2^H(X)
-    perplexity = 2**entropy
+    # Build distributions
+    entropy_dist = make_distribution(entropy_values)
+    perplexity_dist = make_distribution(perplexity_values)
 
     return EntropyResult(
-        entropy=
-        perplexity=
+        entropy=entropy_dist.mean,
+        perplexity=perplexity_dist.mean,
        ngram_type=f"{ngram_type}_{n}gram",
+        entropy_dist=entropy_dist,
+        perplexity_dist=perplexity_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
             "n": n,
             "ngram_type": ngram_type,
-            "
-            "
+            "total_item_count": total_items,
+            "total_unique_ngrams": total_unique_ngrams,
             "total_ngrams": total_ngrams,
         },
     )
 
 
-def compute_character_bigram_entropy(text: str) -> EntropyResult:
+def compute_character_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute character bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="character".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with character bigram entropy and
+        EntropyResult with character bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_character_bigram_entropy("
-        >>>
+        >>> result = compute_character_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        3.8
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="character")
+    return compute_ngram_entropy(text, n=2, ngram_type="character", chunk_size=chunk_size)
 
 
-def compute_word_bigram_entropy(text: str) -> EntropyResult:
+def compute_word_bigram_entropy(text: str, chunk_size: int = 1000) -> EntropyResult:
     """
     Compute word bigram entropy.
 
     Convenience function that calls compute_ngram_entropy with n=2, ngram_type="word".
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
 
     Returns:
-        EntropyResult with word bigram entropy and
+        EntropyResult with word bigram entropy, perplexity, and distributions
 
     Example:
-        >>> result = compute_word_bigram_entropy("
-        >>>
+        >>> result = compute_word_bigram_entropy("Long text here...", chunk_size=1000)
+        >>> result.entropy  # Mean across chunks
+        5.2
     """
-    return compute_ngram_entropy(text, n=2, ngram_type="word")
+    return compute_ngram_entropy(text, n=2, ngram_type="word", chunk_size=chunk_size)
```
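Taken together, the new signatures keep the 1.0.0 call sites working (the scalar `yule_k`, `yule_i`, `entropy`, and `perplexity` fields are now the mean across chunks) while exposing the per-chunk spread. A hedged usage sketch; the input path is a placeholder and the printed values depend on the text:

```python
# Illustrative use of the 1.3.0 chunked results. Field names are taken from the
# diffs above; "novel.txt" is a placeholder input and the comments describe the
# fields rather than real output values.
from pathlib import Path

from pystylometry.lexical.yule import compute_yule
from pystylometry.ngrams.entropy import compute_word_bigram_entropy

text = Path("novel.txt").read_text(encoding="utf-8")

yule = compute_yule(text, chunk_size=1000)
print(yule.yule_k)             # mean Yule's K across 1000-word chunks
print(yule.yule_k_dist.std)    # spread across chunks (the fingerprint signal)
print(yule.chunk_count)        # number of chunks analysed

bigrams = compute_word_bigram_entropy(text, chunk_size=1000)
print(bigrams.entropy, bigrams.entropy_dist.iqr)
```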