pystylometry 1.0.0-py3-none-any.whl → 1.3.0-py3-none-any.whl

This diff compares publicly released versions of the package as published to a supported registry. It is provided for informational purposes and reflects the package contents as they appear in that registry.
Files changed (64)
  1. pystylometry/README.md +42 -0
  2. pystylometry/__init__.py +45 -3
  3. pystylometry/_types.py +1017 -259
  4. pystylometry/authorship/README.md +21 -0
  5. pystylometry/authorship/__init__.py +28 -4
  6. pystylometry/authorship/additional_methods.py +260 -40
  7. pystylometry/authorship/compression.py +175 -0
  8. pystylometry/authorship/kilgarriff.py +354 -0
  9. pystylometry/character/README.md +17 -0
  10. pystylometry/character/character_metrics.py +267 -179
  11. pystylometry/cli.py +427 -0
  12. pystylometry/consistency/README.md +27 -0
  13. pystylometry/consistency/__init__.py +57 -0
  14. pystylometry/consistency/_thresholds.py +162 -0
  15. pystylometry/consistency/drift.py +549 -0
  16. pystylometry/dialect/README.md +26 -0
  17. pystylometry/dialect/__init__.py +65 -0
  18. pystylometry/dialect/_data/dialect_markers.json +1134 -0
  19. pystylometry/dialect/_loader.py +360 -0
  20. pystylometry/dialect/detector.py +533 -0
  21. pystylometry/lexical/README.md +23 -0
  22. pystylometry/lexical/advanced_diversity.py +61 -22
  23. pystylometry/lexical/function_words.py +255 -56
  24. pystylometry/lexical/hapax.py +182 -52
  25. pystylometry/lexical/mtld.py +108 -26
  26. pystylometry/lexical/ttr.py +76 -10
  27. pystylometry/lexical/word_frequency_sophistication.py +1522 -298
  28. pystylometry/lexical/yule.py +136 -50
  29. pystylometry/ngrams/README.md +18 -0
  30. pystylometry/ngrams/entropy.py +150 -49
  31. pystylometry/ngrams/extended_ngrams.py +314 -69
  32. pystylometry/prosody/README.md +17 -0
  33. pystylometry/prosody/rhythm_prosody.py +773 -11
  34. pystylometry/readability/README.md +23 -0
  35. pystylometry/readability/additional_formulas.py +1887 -762
  36. pystylometry/readability/ari.py +144 -82
  37. pystylometry/readability/coleman_liau.py +136 -109
  38. pystylometry/readability/flesch.py +177 -73
  39. pystylometry/readability/gunning_fog.py +165 -161
  40. pystylometry/readability/smog.py +123 -42
  41. pystylometry/stylistic/README.md +20 -0
  42. pystylometry/stylistic/cohesion_coherence.py +669 -13
  43. pystylometry/stylistic/genre_register.py +1560 -17
  44. pystylometry/stylistic/markers.py +611 -17
  45. pystylometry/stylistic/vocabulary_overlap.py +354 -13
  46. pystylometry/syntactic/README.md +20 -0
  47. pystylometry/syntactic/advanced_syntactic.py +76 -14
  48. pystylometry/syntactic/pos_ratios.py +70 -6
  49. pystylometry/syntactic/sentence_stats.py +55 -12
  50. pystylometry/syntactic/sentence_types.py +71 -15
  51. pystylometry/viz/README.md +27 -0
  52. pystylometry/viz/__init__.py +71 -0
  53. pystylometry/viz/drift.py +589 -0
  54. pystylometry/viz/jsx/__init__.py +31 -0
  55. pystylometry/viz/jsx/_base.py +144 -0
  56. pystylometry/viz/jsx/report.py +677 -0
  57. pystylometry/viz/jsx/timeline.py +716 -0
  58. pystylometry/viz/jsx/viewer.py +1032 -0
  59. pystylometry-1.3.0.dist-info/METADATA +136 -0
  60. pystylometry-1.3.0.dist-info/RECORD +76 -0
  61. {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
  62. pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
  63. pystylometry-1.0.0.dist-info/METADATA +0 -275
  64. pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/readability/flesch.py
@@ -1,19 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level."""
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
 from .._normalize import normalize_for_readability
-from .._types import FleschResult
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def compute_flesch(text: str) -> FleschResult:
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
         Higher scores = easier to read
-        Typical range: 0-100, but can exceed bounds for extremely simple (>100) or complex (<0) text
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
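
A quick numeric check of the two formulas documented above (a standalone sketch, not part of the diff; the word, sentence, and syllable counts are invented for illustration):

    # Worked example of the Flesch formulas with assumed counts:
    # 12 words across 2 sentences, 16 total syllables.
    words, sentences, syllables = 12, 2, 16

    words_per_sentence = words / sentences    # 6.0
    syllables_per_word = syllables / words    # ~1.33

    reading_ease = 206.835 - 1.015 * words_per_sentence - 84.6 * syllables_per_word
    grade_level = 0.39 * words_per_sentence + 11.8 * syllables_per_word - 15.59

    print(round(reading_ease, 1))  # ~87.9 -> "Easy" (>= 80) on the scale above
    print(round(grade_level, 1))   # ~2.5  -> roughly early primary school

Note that syllables-per-word dominates reading ease (coefficient 84.6 versus 1.015), which is why polysyllabic prose scores as difficult even when sentences are short.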
@@ -27,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
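
The difficulty scale in this docstring maps onto _get_difficulty from the first hunk; boundary scores take the easier label because the comparisons use >=. A doctest-style check, assuming the function exactly as defined in the diff above:

    >>> _get_difficulty(90.0)   # boundary goes to the easier band
    'Very Easy'
    >>> _get_difficulty(89.9)
    'Easy'
    >>> _get_difficulty(60.0)
    'Standard'
    >>> _get_difficulty(29.9)
    'Very Difficult'
    >>> _get_difficulty(float('nan'))   # empty input propagates as NaN
    'Unknown'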
@@ -36,91 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with reading ease, grade level, and difficulty rating
-
-    Note: The difficulty label ("Very Easy", "Easy", etc.) is determined solely
-    from the reading_ease score and does NOT consider the grade_level score.
-    This means text with high reading_ease (e.g., 85 = "Easy") but high
-    grade_level (e.g., 12 = college) will still be labeled "Easy". The two
-    metrics measure different aspects of readability and may not always align.
-
-    Note: For empty input (no sentences or words), reading_ease and grade_level
-    will be float('nan'). This prevents conflating "no data" with "extremely
-    difficult text" (score of 0). Consumers should check for NaN before
-    performing arithmetic operations (e.g., using math.isnan() or filtering
-    before aggregation) to avoid silent propagation of NaN in statistics.
+        FleschResult with:
+        - reading_ease: Mean reading ease across chunks
+        - grade_level: Mean grade level across chunks
+        - difficulty: Difficulty rating based on mean reading_ease
+        - reading_ease_dist: Distribution with per-chunk values and stats
+        - grade_level_dist: Distribution with per-chunk values and stats
+        - chunk_size: The chunk size used
+        - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("The quick brown fox jumps over the lazy dog.")
-        >>> print(f"Reading Ease: {result.reading_ease:.1f}")
-        >>> print(f"Grade Level: {result.grade_level:.1f}")
-        >>> print(f"Difficulty: {result.difficulty}")
-
-        >>> # Empty input returns NaN
-        >>> import math
-        >>> result_empty = compute_flesch("")
-        >>> math.isnan(result_empty.reading_ease)
-        True
-        >>> result_empty.difficulty
-        'Unknown'
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
-    sentences = split_sentences(text)
-    tokens = tokenize(text)
-
-    # Filter tokens to only valid words for syllable counting
-    # Removes numbers, URLs, emails, etc. that would cause errors
-    word_tokens = normalize_for_readability(tokens)
-
-    if len(sentences) == 0 or len(word_tokens) == 0:
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FleschResult(
             reading_ease=float("nan"),
             grade_level=float("nan"),
             difficulty="Unknown",
-            metadata={"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
 
-    # Count syllables (safe now - only valid words)
-    total_syllables = sum(count_syllables(word) for word in word_tokens)
-
-    # Calculate metrics
-    words_per_sentence = len(word_tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(word_tokens)
-
-    # Flesch Reading Ease: 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
-
-    # Flesch-Kincaid Grade Level: 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
-    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    # Determine difficulty rating based ONLY on reading ease score (not grade level)
-    # This is a conscious design choice: difficulty labels follow the Reading Ease
-    # thresholds exclusively, even though grade_level may suggest a different difficulty
-    if reading_ease >= 90:
-        difficulty = "Very Easy"
-    elif reading_ease >= 80:
-        difficulty = "Easy"
-    elif reading_ease >= 70:
-        difficulty = "Fairly Easy"
-    elif reading_ease >= 60:
-        difficulty = "Standard"
-    elif reading_ease >= 50:
-        difficulty = "Fairly Difficult"
-    elif reading_ease >= 30:
-        difficulty = "Difficult"
-    else:
-        difficulty = "Very Difficult"
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=reading_ease,
-        grade_level=grade_level,
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-            "sentence_count": len(sentences),
-            "word_count": len(word_tokens),
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
        },
    )
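
The new code path leans on two helpers imported from .._types that this diff does not display: chunk_text and make_distribution (pystylometry/_types.py changed separately, +1017 -259). A minimal sketch of plausible implementations, assuming word-based chunking and exactly the Distribution fields constructed above (values, mean, median, std, range, iqr); the package's real helpers may differ:

    # Hypothetical stand-ins for pystylometry's chunk_text / make_distribution.
    # Field names mirror the Distribution(...) construction in the diff above;
    # the chunking rule and std flavor (population vs. sample) are assumptions.
    import statistics
    from dataclasses import dataclass

    @dataclass
    class Distribution:
        values: list[float]
        mean: float
        median: float
        std: float
        range: float
        iqr: float

    def chunk_text(text: str, chunk_size: int) -> list[str]:
        """Split text into consecutive chunks of chunk_size whitespace tokens."""
        words = text.split()
        return [
            " ".join(words[i : i + chunk_size])
            for i in range(0, len(words), chunk_size)
        ] or [""]  # empty input still yields one (empty) chunk

    def make_distribution(values: list[float]) -> Distribution:
        """Summarize per-chunk metric values (callers pass non-empty lists)."""
        if len(values) >= 2:
            q1, _, q3 = statistics.quantiles(values, n=4)
        else:
            q1 = q3 = values[0]
        return Distribution(
            values=values,
            mean=statistics.fmean(values),
            median=statistics.median(values),
            std=statistics.pstdev(values),
            range=max(values) - min(values),
            iqr=q3 - q1,
        )

With helpers like these, compute_flesch("word " * 2500, chunk_size=1000) would split into three chunks (1000, 1000, and 500 words) and summarize three per-chunk scores: the mean lands in reading_ease, while the spread (std, range, iqr) carries the stylometric signal the module docstring refers to.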