pystylometry-1.0.0-py3-none-any.whl → pystylometry-1.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. pystylometry/__init__.py +29 -3
  2. pystylometry/_types.py +963 -259
  3. pystylometry/authorship/__init__.py +23 -2
  4. pystylometry/authorship/additional_methods.py +4 -29
  5. pystylometry/authorship/kilgarriff.py +347 -0
  6. pystylometry/character/character_metrics.py +267 -179
  7. pystylometry/cli.py +427 -0
  8. pystylometry/consistency/__init__.py +57 -0
  9. pystylometry/consistency/_thresholds.py +162 -0
  10. pystylometry/consistency/drift.py +549 -0
  11. pystylometry/dialect/__init__.py +65 -0
  12. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  13. pystylometry/dialect/_loader.py +360 -0
  14. pystylometry/dialect/detector.py +533 -0
  15. pystylometry/lexical/advanced_diversity.py +61 -22
  16. pystylometry/lexical/function_words.py +255 -56
  17. pystylometry/lexical/hapax.py +182 -52
  18. pystylometry/lexical/mtld.py +108 -26
  19. pystylometry/lexical/ttr.py +76 -10
  20. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  21. pystylometry/lexical/yule.py +136 -50
  22. pystylometry/ngrams/entropy.py +150 -49
  23. pystylometry/readability/additional_formulas.py +1887 -762
  24. pystylometry/readability/ari.py +144 -82
  25. pystylometry/readability/coleman_liau.py +136 -109
  26. pystylometry/readability/flesch.py +177 -73
  27. pystylometry/readability/gunning_fog.py +165 -161
  28. pystylometry/readability/smog.py +123 -42
  29. pystylometry/syntactic/advanced_syntactic.py +76 -14
  30. pystylometry/syntactic/pos_ratios.py +70 -6
  31. pystylometry/syntactic/sentence_stats.py +55 -12
  32. pystylometry/syntactic/sentence_types.py +71 -15
  33. pystylometry/viz/__init__.py +71 -0
  34. pystylometry/viz/drift.py +589 -0
  35. pystylometry/viz/jsx/__init__.py +31 -0
  36. pystylometry/viz/jsx/_base.py +144 -0
  37. pystylometry/viz/jsx/report.py +677 -0
  38. pystylometry/viz/jsx/timeline.py +716 -0
  39. pystylometry/viz/jsx/viewer.py +1032 -0
  40. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
  41. pystylometry-1.1.0.dist-info/RECORD +63 -0
  42. {pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
  43. pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
  44. pystylometry-1.0.0.dist-info/RECORD +0 -46
@@ -1,34 +1,26 @@
1
- """Automated Readability Index (ARI)."""
1
+ """Automated Readability Index (ARI).
2
+
3
+ This module implements the ARI readability formula with native chunked
4
+ analysis for stylometric fingerprinting.
5
+
6
+ Related GitHub Issue:
7
+ #27 - Native chunked analysis with Distribution dataclass
8
+ https://github.com/craigtrim/pystylometry/issues/27
9
+ """
2
10
 
3
11
  import math
4
12
 
5
- from .._types import ARIResult
13
+ from .._types import ARIResult, Distribution, chunk_text, make_distribution
6
14
  from .._utils import split_sentences, tokenize
7
15
 
8
16
  # Formula coefficients from Senter & Smith (1967)
9
- # Reference: Senter, R. J., & Smith, E. A. (1967). Automated readability index.
10
- # AMRL-TR-6620. Aerospace Medical Research Laboratories.
11
-
12
- # Coefficient for characters per word
13
17
  _CHARACTER_COEFFICIENT = 4.71
14
-
15
- # Coefficient for words per sentence
16
18
  _WORD_COEFFICIENT = 0.5
17
-
18
- # Intercept to calibrate scale to U.S. grade levels
19
19
  _INTERCEPT = -21.43
20
20
 
21
21
 
22
- def _get_age_range(grade_level: int) -> str:
23
- """
24
- Map grade level to age range.
25
-
26
- Args:
27
- grade_level: U.S. grade level (0-20+)
28
-
29
- Returns:
30
- Age range string
31
- """
22
+ def _get_age_range(grade_level: float) -> str:
23
+ """Map grade level to age range."""
32
24
  if grade_level <= 0:
33
25
  return "5-6 years (Kindergarten)"
34
26
  elif grade_level <= 5:
@@ -43,10 +35,55 @@ def _get_age_range(grade_level: int) -> str:
43
35
  return "22+ years (Graduate)"
44
36
 
45
37
 
46
- def compute_ari(text: str) -> ARIResult:
38
+ def _compute_ari_single(text: str) -> tuple[float, float, dict]:
39
+ """Compute ARI metrics for a single chunk of text.
40
+
41
+ Returns:
42
+ Tuple of (ari_score, grade_level, metadata_dict).
43
+ Returns (nan, nan, metadata) for empty/invalid input.
44
+ """
45
+ sentences = split_sentences(text)
46
+ tokens = tokenize(text)
47
+ character_count = sum(1 for char in text if char.isalnum())
48
+
49
+ if len(sentences) == 0 or len(tokens) == 0:
50
+ return (
51
+ float("nan"),
52
+ float("nan"),
53
+ {"sentence_count": 0, "word_count": 0, "character_count": 0},
54
+ )
55
+
56
+ # Calculate ratios
57
+ chars_per_word = character_count / len(tokens)
58
+ words_per_sentence = len(tokens) / len(sentences)
59
+
60
+ # Apply ARI formula
61
+ ari_score = (
62
+ _CHARACTER_COEFFICIENT * chars_per_word
63
+ + _WORD_COEFFICIENT * words_per_sentence
64
+ + _INTERCEPT
65
+ )
66
+
67
+ grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
68
+
69
+ metadata = {
70
+ "sentence_count": len(sentences),
71
+ "word_count": len(tokens),
72
+ "character_count": character_count,
73
+ "characters_per_word": chars_per_word,
74
+ "words_per_sentence": words_per_sentence,
75
+ }
76
+
77
+ return (ari_score, float(grade_level), metadata)
78
+
79
+
80
+ def compute_ari(text: str, chunk_size: int = 1000) -> ARIResult:
47
81
  """
48
82
  Compute Automated Readability Index (ARI).
49
83
 
84
+ This function uses native chunked analysis to capture variance and patterns
85
+ across the text, which is essential for stylometric fingerprinting.
86
+
50
87
  Formula:
51
88
  ARI = 4.71 × (characters/words) + 0.5 × (words/sentences) - 21.43
52
89
 
@@ -54,18 +91,9 @@ def compute_ari(text: str) -> ARIResult:
54
91
  but adds sentence length as a factor. It produces an approximate
55
92
  representation of the US grade level needed to comprehend the text.
56
93
 
57
- **Implementation Notes:**
58
- - Grade levels are clamped to [0, 20] range
59
- - Uses round-half-up rounding for grade level calculation
60
- - Character count includes alphanumeric characters only (letters and digits)
61
- - Reliability heuristic: 100+ words recommended
62
-
63
- Grade Level to Age mapping:
64
- 1-5: 6-11 years (Elementary)
65
- 6-8: 11-14 years (Middle School)
66
- 9-12: 14-18 years (High School)
67
- 13-14: 18-22 years (College)
68
- 15+: 22+ years (Graduate)
94
+ Related GitHub Issue:
95
+ #27 - Native chunked analysis with Distribution dataclass
96
+ https://github.com/craigtrim/pystylometry/issues/27
69
97
 
70
98
  References:
71
99
  Senter, R. J., & Smith, E. A. (1967). Automated readability index.
@@ -73,74 +101,108 @@ def compute_ari(text: str) -> ARIResult:
73
101
 
74
102
  Args:
75
103
  text: Input text to analyze
104
+ chunk_size: Number of words per chunk (default: 1000).
105
+ The text is divided into chunks of this size, and metrics are
106
+ computed per-chunk.
76
107
 
77
108
  Returns:
78
- ARIResult with ARI score, grade level, and age range
109
+ ARIResult with:
110
+ - ari_score: Mean ARI score across chunks
111
+ - grade_level: Mean grade level across chunks
112
+ - age_range: Age range based on mean grade level
113
+ - ari_score_dist: Distribution with per-chunk values and stats
114
+ - grade_level_dist: Distribution with per-chunk values and stats
115
+ - chunk_size: The chunk size used
116
+ - chunk_count: Number of chunks analyzed
79
117
 
80
118
  Example:
81
- >>> result = compute_ari("The quick brown fox jumps over the lazy dog.")
82
- >>> print(f"ARI Score: {result.ari_score:.1f}")
83
- ARI Score: 0.1
84
- >>> print(f"Grade Level: {result.grade_level}")
85
- Grade Level: 0
86
- >>> print(f"Age Range: {result.age_range}")
87
- Age Range: 5-6 years (Kindergarten)
88
- >>> result.metadata["reliable"]
89
- False
119
+ >>> result = compute_ari("Long text here...", chunk_size=1000)
120
+ >>> result.ari_score # Mean across chunks
121
+ 9.5
122
+ >>> result.ari_score_dist.std # Variance reveals fingerprint
123
+ 1.5
90
124
  """
91
- sentences = split_sentences(text)
92
- tokens = tokenize(text)
93
-
94
- # Count characters (alphanumeric: letters and digits, excluding spaces/punctuation)
95
- # Computed before early return to ensure metadata consistency
96
- character_count = sum(1 for char in text if char.isalnum())
97
-
98
- if len(sentences) == 0 or len(tokens) == 0:
125
+ # Chunk the text
126
+ chunks = chunk_text(text, chunk_size)
127
+
128
+ # Compute metrics per chunk
129
+ ari_values = []
130
+ grade_values = []
131
+ total_sentences = 0
132
+ total_words = 0
133
+ total_chars = 0
134
+
135
+ for chunk in chunks:
136
+ ai, gl, meta = _compute_ari_single(chunk)
137
+ if not math.isnan(ai):
138
+ ari_values.append(ai)
139
+ grade_values.append(gl)
140
+ total_sentences += meta.get("sentence_count", 0)
141
+ total_words += meta.get("word_count", 0)
142
+ total_chars += meta.get("character_count", 0)
143
+
144
+ # Handle empty or all-invalid chunks
145
+ if not ari_values:
146
+ empty_dist = Distribution(
147
+ values=[],
148
+ mean=float("nan"),
149
+ median=float("nan"),
150
+ std=0.0,
151
+ range=0.0,
152
+ iqr=0.0,
153
+ )
99
154
  return ARIResult(
100
- ari_score=0.0,
101
- grade_level=0,
102
- age_range="5-6 years (Kindergarten)",
155
+ ari_score=float("nan"),
156
+ grade_level=float("nan"),
157
+ age_range="Unknown",
158
+ ari_score_dist=empty_dist,
159
+ grade_level_dist=empty_dist,
160
+ chunk_size=chunk_size,
161
+ chunk_count=len(chunks),
103
162
  metadata={
104
- "sentence_count": len(sentences),
105
- "word_count": len(tokens),
106
- "character_count": character_count,
163
+ # Backward-compatible keys
164
+ "sentence_count": 0,
165
+ "word_count": 0,
166
+ "character_count": 0,
107
167
  "characters_per_word": 0.0,
108
168
  "words_per_sentence": 0.0,
169
+ # New prefixed keys for consistency
170
+ "total_sentence_count": 0,
171
+ "total_word_count": 0,
172
+ "total_character_count": 0,
109
173
  "reliable": False,
110
174
  },
111
175
  )
112
176
 
113
- # Calculate ratios
114
- chars_per_word = character_count / len(tokens)
115
- words_per_sentence = len(tokens) / len(sentences)
116
-
117
- # Apply ARI formula
118
- ari_score = (
119
- _CHARACTER_COEFFICIENT * chars_per_word
120
- + _WORD_COEFFICIENT * words_per_sentence
121
- + _INTERCEPT
122
- )
123
-
124
- # Use round-half-up rounding and clamp to valid grade range [0, 20]
125
- # math.floor(x + 0.5) implements round-half-up for both positive and negative values
126
- grade_level = max(0, min(20, math.floor(ari_score + 0.5)))
177
+ # Build distributions
178
+ ari_dist = make_distribution(ari_values)
179
+ grade_dist = make_distribution(grade_values)
127
180
 
128
- # Get age range from grade level
129
- age_range = _get_age_range(grade_level)
181
+ # Get age range from mean grade level
182
+ age_range = _get_age_range(grade_dist.mean)
130
183
 
131
- # Reliability heuristic: like other readability metrics, 100+ words recommended
132
- reliable = len(tokens) >= 100
184
+ # Reliability heuristic
185
+ reliable = total_words >= 100
133
186
 
134
187
  return ARIResult(
135
- ari_score=ari_score,
136
- grade_level=grade_level,
188
+ ari_score=ari_dist.mean,
189
+ grade_level=grade_dist.mean,
137
190
  age_range=age_range,
191
+ ari_score_dist=ari_dist,
192
+ grade_level_dist=grade_dist,
193
+ chunk_size=chunk_size,
194
+ chunk_count=len(chunks),
138
195
  metadata={
139
- "sentence_count": len(sentences),
140
- "word_count": len(tokens),
141
- "character_count": character_count,
142
- "characters_per_word": chars_per_word,
143
- "words_per_sentence": words_per_sentence,
196
+ # Backward-compatible keys
197
+ "sentence_count": total_sentences,
198
+ "word_count": total_words,
199
+ "character_count": total_chars,
200
+ "characters_per_word": total_chars / total_words if total_words > 0 else 0,
201
+ "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
202
+ # New prefixed keys for consistency
203
+ "total_sentence_count": total_sentences,
204
+ "total_word_count": total_words,
205
+ "total_character_count": total_chars,
144
206
  "reliable": reliable,
145
207
  },
146
208
  )
@@ -1,31 +1,69 @@
1
- """Coleman-Liau Index."""
1
+ """Coleman-Liau Index.
2
+
3
+ This module implements the Coleman-Liau readability formula with native chunked
4
+ analysis for stylometric fingerprinting.
5
+
6
+ Related GitHub Issue:
7
+ #27 - Native chunked analysis with Distribution dataclass
8
+ https://github.com/craigtrim/pystylometry/issues/27
9
+ """
2
10
 
3
11
  import math
4
12
 
5
- from .._types import ColemanLiauResult
13
+ from .._types import ColemanLiauResult, Distribution, chunk_text, make_distribution
6
14
  from .._utils import split_sentences, tokenize
7
15
 
8
16
  # Regression coefficients from Coleman & Liau (1975)
9
- # Derived from empirical analysis of Cloze test results on graded texts
10
- # Reference: Coleman, M., & Liau, T. L. (1975). A computer readability formula
11
- # designed for machine scoring. Journal of Applied Psychology, 60(2), 283.
12
-
13
- # Coefficient for letters per 100 words
14
- # Represents impact of word length on reading difficulty
15
17
  _LETTER_COEFFICIENT = 0.0588
16
-
17
- # Coefficient for sentences per 100 words (negative: more sentences = easier)
18
- # Represents impact of sentence length on reading difficulty
19
18
  _SENTENCE_COEFFICIENT = -0.296
20
-
21
- # Intercept to calibrate scale to U.S. grade levels (1-16)
22
19
  _INTERCEPT = -15.8
23
20
 
24
21
 
25
- def compute_coleman_liau(text: str) -> ColemanLiauResult:
22
+ def _compute_coleman_liau_single(text: str) -> tuple[float, float, dict]:
23
+ """Compute Coleman-Liau metrics for a single chunk of text.
24
+
25
+ Returns:
26
+ Tuple of (cli_index, grade_level, metadata_dict).
27
+ Returns (nan, nan, metadata) for empty/invalid input.
28
+ """
29
+ sentences = split_sentences(text)
30
+ all_tokens = tokenize(text)
31
+ tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
32
+ letter_count = sum(1 for token in tokens for char in token if char.isalpha())
33
+
34
+ if len(sentences) == 0 or len(tokens) == 0:
35
+ return (
36
+ float("nan"),
37
+ float("nan"),
38
+ {"sentence_count": 0, "word_count": 0, "letter_count": 0},
39
+ )
40
+
41
+ # Calculate per 100 words
42
+ L = (letter_count / len(tokens)) * 100 # noqa: N806
43
+ S = (len(sentences) / len(tokens)) * 100 # noqa: N806
44
+
45
+ # Compute Coleman-Liau Index
46
+ cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
47
+ grade_level = max(0, math.floor(cli_index + 0.5))
48
+
49
+ metadata = {
50
+ "sentence_count": len(sentences),
51
+ "word_count": len(tokens),
52
+ "letter_count": letter_count,
53
+ "letters_per_100_words": L,
54
+ "sentences_per_100_words": S,
55
+ }
56
+
57
+ return (cli_index, float(grade_level), metadata)
58
+
59
+
60
+ def compute_coleman_liau(text: str, chunk_size: int = 1000) -> ColemanLiauResult:
26
61
  """
27
62
  Compute Coleman-Liau Index.
28
63
 
64
+ This function uses native chunked analysis to capture variance and patterns
65
+ across the text, which is essential for stylometric fingerprinting.
66
+
29
67
  Formula:
30
68
  CLI = 0.0588 × L - 0.296 × S - 15.8
31
69
 
@@ -36,19 +74,9 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
36
74
  The Coleman-Liau index relies on characters rather than syllables,
37
75
  making it easier to compute and not requiring syllable-counting algorithms.
38
76
 
39
- **Implementation Notes:**
40
- - Grade levels are NOT clamped (removed upper bound of 20 per PR #2 review).
41
- The original Coleman & Liau (1975) paper calibrated to grades 1-16 but did not
42
- specify an upper bound. Post-graduate texts may exceed grade 20.
43
- - Uses round-half-up rounding (not banker's rounding) for grade level calculation
44
- - Letter counts (Unicode alphabetic characters only) computed from tokenized words
45
- to ensure measurement consistency. Both letter count and word count use identical
46
- tokenization logic, preventing divergence in edge cases (emails, URLs, hyphens).
47
- See PR #2 review discussion: https://github.com/craigtrim/pystylometry/pull/2
48
- - Reliability heuristic based on validation study passage lengths (~100 words);
49
- shorter texts flagged in metadata
50
- - English-centric sentence splitting and Unicode assumptions limit true
51
- cross-language applicability
77
+ Related GitHub Issue:
78
+ #27 - Native chunked analysis with Distribution dataclass
79
+ https://github.com/craigtrim/pystylometry/issues/27
52
80
 
53
81
  References:
54
82
  Coleman, M., & Liau, T. L. (1975). A computer readability formula
@@ -56,105 +84,104 @@ def compute_coleman_liau(text: str) -> ColemanLiauResult:
56
84
 
57
85
  Args:
58
86
  text: Input text to analyze
87
+ chunk_size: Number of words per chunk (default: 1000).
88
+ The text is divided into chunks of this size, and metrics are
89
+ computed per-chunk.
59
90
 
60
91
  Returns:
61
- ColemanLiauResult with CLI index and grade level
92
+ ColemanLiauResult with:
93
+ - cli_index: Mean CLI across chunks
94
+ - grade_level: Mean grade level across chunks
95
+ - cli_index_dist: Distribution with per-chunk values and stats
96
+ - grade_level_dist: Distribution with per-chunk values and stats
97
+ - chunk_size: The chunk size used
98
+ - chunk_count: Number of chunks analyzed
62
99
 
63
100
  Example:
64
- >>> result = compute_coleman_liau("The quick brown fox jumps over the lazy dog.")
65
- >>> print(f"CLI Index: {result.cli_index:.1f}")
66
- CLI Index: 3.8
67
- >>> print(f"Grade Level: {result.grade_level}")
68
- Grade Level: 4
69
- >>> result.metadata["reliable"]
70
- False
101
+ >>> result = compute_coleman_liau("Long text here...", chunk_size=1000)
102
+ >>> result.cli_index # Mean across chunks
103
+ 8.5
104
+ >>> result.cli_index_dist.std # Variance reveals fingerprint
105
+ 1.2
71
106
  """
72
- sentences = split_sentences(text)
73
- all_tokens = tokenize(text)
74
-
75
- # Filter to only tokens that contain at least one alphabetic character
76
- # This excludes pure punctuation (. ! ?) but keeps words with mixed content
77
- # (Hello123, Test@example.com) to count their letters per Coleman-Liau spec.
78
- # This is different from Gunning Fog which uses stricter normalization.
79
- tokens = [token for token in all_tokens if any(char.isalpha() for char in token)]
80
-
81
- # CRITICAL: Count letters from tokenized words, NOT from raw text
82
- # ===============================================================
83
- # Coleman & Liau (1975) define L as "average number of letters per 100 words"
84
- # where both letters and words must be measured consistently from the same text units.
85
- #
86
- # Original implementation (buggy):
87
- # letter_count = sum(1 for char in text if char.isalpha())
88
- # This counted letters from RAW text but words from TOKENIZED text
89
- #
90
- # Problem cases (PR #2 review https://github.com/craigtrim/pystylometry/pull/2):
91
- # - "test@example.com" tokenizer may split into ['test', '@', 'example', '.', 'com']
92
- # Raw letter count: 15 letters, Token count: 5 tokens → wrong ratio
93
- # - "co-operate" → tokenizer may split into ['co', '-', 'operate']
94
- # Raw letter count: 9 letters, Token count: 3 tokens → wrong ratio
95
- # - URLs, special tokens, etc. → similar inconsistencies
96
- #
97
- # Fixed implementation:
98
- # Count only alphabetic characters that appear in valid word tokens (after normalization).
99
- # This ensures both letter count and word count use identical tokenization logic,
100
- # maintaining the mathematical integrity of the L term in the Coleman-Liau formula.
101
- letter_count = sum(1 for token in tokens for char in token if char.isalpha())
102
-
103
- if len(sentences) == 0 or len(tokens) == 0:
107
+ # Chunk the text
108
+ chunks = chunk_text(text, chunk_size)
109
+
110
+ # Compute metrics per chunk
111
+ cli_values = []
112
+ grade_values = []
113
+ total_sentences = 0
114
+ total_words = 0
115
+ total_letters = 0
116
+
117
+ for chunk in chunks:
118
+ ci, gl, meta = _compute_coleman_liau_single(chunk)
119
+ if not math.isnan(ci):
120
+ cli_values.append(ci)
121
+ grade_values.append(gl)
122
+ total_sentences += meta.get("sentence_count", 0)
123
+ total_words += meta.get("word_count", 0)
124
+ total_letters += meta.get("letter_count", 0)
125
+
126
+ # Handle empty or all-invalid chunks
127
+ if not cli_values:
128
+ empty_dist = Distribution(
129
+ values=[],
130
+ mean=float("nan"),
131
+ median=float("nan"),
132
+ std=0.0,
133
+ range=0.0,
134
+ iqr=0.0,
135
+ )
104
136
  return ColemanLiauResult(
105
- cli_index=0.0,
106
- grade_level=0,
137
+ cli_index=float("nan"),
138
+ grade_level=float("nan"),
139
+ cli_index_dist=empty_dist,
140
+ grade_level_dist=empty_dist,
141
+ chunk_size=chunk_size,
142
+ chunk_count=len(chunks),
107
143
  metadata={
108
- "sentence_count": len(sentences),
109
- "word_count": len(tokens),
110
- "letter_count": letter_count,
144
+ # Backward-compatible keys
145
+ "sentence_count": 0,
146
+ "word_count": 0,
147
+ "letter_count": 0,
111
148
  "letters_per_100_words": 0.0,
112
149
  "sentences_per_100_words": 0.0,
150
+ # New prefixed keys for consistency
151
+ "total_sentence_count": 0,
152
+ "total_word_count": 0,
153
+ "total_letter_count": 0,
113
154
  "reliable": False,
114
155
  },
115
156
  )
116
157
 
117
- # Calculate per 100 words
118
- L = (letter_count / len(tokens)) * 100 # noqa: N806
119
- S = (len(sentences) / len(tokens)) * 100 # noqa: N806
120
-
121
- # Compute Coleman-Liau Index using empirically-derived coefficients
122
- cli_index = _LETTER_COEFFICIENT * L + _SENTENCE_COEFFICIENT * S + _INTERCEPT
123
-
124
- # Grade Level Calculation and Bounds
125
- # ===================================
126
- # Round-half-up rounding (not Python's default banker's rounding):
127
- # 4.5 → 5 (always rounds up), not round-half-to-even
128
- # math.floor(x + 0.5) implements this for both positive and negative values
129
- #
130
- # Lower bound (0): Prevent negative grades for very simple texts
131
- # Coleman & Liau (1975) calibrated to U.S. grades 1-16, but simpler texts
132
- # (e.g., "Go. Run. Stop.") can produce negative CLI values. We clamp to 0
133
- # as there is no "negative grade level" in the educational system.
134
- #
135
- # Upper bound (REMOVED per PR #2 review):
136
- # Original implementation clamped at grade 20, but this was arbitrary.
137
- # Coleman & Liau (1975) did not specify an upper bound in their paper.
138
- # Clamping discards information: PhD dissertations (grade 25) and complex
139
- # legal documents (grade 30+) would both report as grade 20, making them
140
- # indistinguishable. The empirical formula should determine the full range.
141
- #
142
- # See PR #2 discussion: https://github.com/craigtrim/pystylometry/pull/2
143
- grade_level = max(0, math.floor(cli_index + 0.5))
158
+ # Build distributions
159
+ cli_dist = make_distribution(cli_values)
160
+ grade_dist = make_distribution(grade_values)
144
161
 
145
- # Reliability heuristic: validation study used ~100-word passages
146
- # Not a hard minimum, but shorter texts may deviate from expected behavior
147
- reliable = len(tokens) >= 100
162
+ # Reliability heuristic
163
+ reliable = total_words >= 100
148
164
 
149
165
  return ColemanLiauResult(
150
- cli_index=cli_index,
151
- grade_level=grade_level,
166
+ cli_index=cli_dist.mean,
167
+ grade_level=grade_dist.mean,
168
+ cli_index_dist=cli_dist,
169
+ grade_level_dist=grade_dist,
170
+ chunk_size=chunk_size,
171
+ chunk_count=len(chunks),
152
172
  metadata={
153
- "sentence_count": len(sentences),
154
- "word_count": len(tokens),
155
- "letter_count": letter_count,
156
- "letters_per_100_words": L,
157
- "sentences_per_100_words": S,
173
+ # Backward-compatible keys
174
+ "sentence_count": total_sentences,
175
+ "word_count": total_words,
176
+ "letter_count": total_letters,
177
+ "letters_per_100_words": (total_letters / total_words * 100) if total_words > 0 else 0,
178
+ "sentences_per_100_words": (total_sentences / total_words * 100)
179
+ if total_words > 0
180
+ else 0,
181
+ # New prefixed keys for consistency
182
+ "total_sentence_count": total_sentences,
183
+ "total_word_count": total_words,
184
+ "total_letter_count": total_letters,
158
185
  "reliable": reliable,
159
186
  },
160
187
  )