pystylometry 0.1.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/__init__.py +30 -5
- pystylometry/_normalize.py +277 -0
- pystylometry/_types.py +1954 -28
- pystylometry/_utils.py +4 -0
- pystylometry/authorship/__init__.py +26 -1
- pystylometry/authorship/additional_methods.py +75 -0
- pystylometry/authorship/kilgarriff.py +347 -0
- pystylometry/character/__init__.py +15 -0
- pystylometry/character/character_metrics.py +389 -0
- pystylometry/cli.py +427 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/__init__.py +13 -6
- pystylometry/lexical/advanced_diversity.py +680 -0
- pystylometry/lexical/function_words.py +590 -0
- pystylometry/lexical/hapax.py +310 -33
- pystylometry/lexical/mtld.py +180 -22
- pystylometry/lexical/ttr.py +149 -0
- pystylometry/lexical/word_frequency_sophistication.py +1805 -0
- pystylometry/lexical/yule.py +142 -29
- pystylometry/ngrams/__init__.py +2 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +235 -0
- pystylometry/prosody/__init__.py +12 -0
- pystylometry/prosody/rhythm_prosody.py +53 -0
- pystylometry/readability/__init__.py +12 -0
- pystylometry/readability/additional_formulas.py +2110 -0
- pystylometry/readability/ari.py +173 -35
- pystylometry/readability/coleman_liau.py +150 -30
- pystylometry/readability/complex_words.py +531 -0
- pystylometry/readability/flesch.py +181 -32
- pystylometry/readability/gunning_fog.py +208 -35
- pystylometry/readability/smog.py +126 -28
- pystylometry/readability/syllables.py +137 -30
- pystylometry/stylistic/__init__.py +20 -0
- pystylometry/stylistic/cohesion_coherence.py +45 -0
- pystylometry/stylistic/genre_register.py +45 -0
- pystylometry/stylistic/markers.py +131 -0
- pystylometry/stylistic/vocabulary_overlap.py +47 -0
- pystylometry/syntactic/__init__.py +4 -0
- pystylometry/syntactic/advanced_syntactic.py +494 -0
- pystylometry/syntactic/pos_ratios.py +172 -17
- pystylometry/syntactic/sentence_stats.py +105 -18
- pystylometry/syntactic/sentence_types.py +526 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +49 -9
- pystylometry-1.1.0.dist-info/RECORD +63 -0
- pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
- pystylometry-0.1.0.dist-info/RECORD +0 -26
- {pystylometry-0.1.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +0 -0
pystylometry/readability/flesch.py

@@ -1,17 +1,95 @@
-"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
+"""Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
-
+This module implements the Flesch readability formulas with native chunked
+analysis for stylometric fingerprinting.
+
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, FleschResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
 from .syllables import count_syllables
 
 
-def
+def _compute_flesch_single(text: str) -> tuple[float, float, dict]:
+    """Compute Flesch metrics for a single chunk of text.
+
+    Returns:
+        Tuple of (reading_ease, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
+    """
+    sentences = split_sentences(text)
+    tokens = tokenize(text)
+
+    # Filter tokens to only valid words for syllable counting
+    word_tokens = normalize_for_readability(tokens)
+
+    if len(sentences) == 0 or len(word_tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {"sentence_count": 0, "word_count": 0, "syllable_count": 0},
+        )
+
+    # Count syllables
+    total_syllables = sum(count_syllables(word) for word in word_tokens)
+
+    # Calculate metrics
+    words_per_sentence = len(word_tokens) / len(sentences)
+    syllables_per_word = total_syllables / len(word_tokens)
+
+    # Flesch Reading Ease
+    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
+
+    # Flesch-Kincaid Grade Level
+    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(word_tokens),
+        "syllable_count": total_syllables,
+        "words_per_sentence": words_per_sentence,
+        "syllables_per_word": syllables_per_word,
+    }
+
+    return (reading_ease, grade_level, metadata)
+
+
+def _get_difficulty(reading_ease: float) -> str:
+    """Determine difficulty rating based on reading ease score."""
+    import math
+
+    if math.isnan(reading_ease):
+        return "Unknown"
+    if reading_ease >= 90:
+        return "Very Easy"
+    if reading_ease >= 80:
+        return "Easy"
+    if reading_ease >= 70:
+        return "Fairly Easy"
+    if reading_ease >= 60:
+        return "Standard"
+    if reading_ease >= 50:
+        return "Fairly Difficult"
+    if reading_ease >= 30:
+        return "Difficult"
+    return "Very Difficult"
+
+
+def compute_flesch(text: str, chunk_size: int = 1000) -> FleschResult:
     """
     Compute Flesch Reading Ease and Flesch-Kincaid Grade Level.
 
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
     Flesch Reading Ease:
         Score = 206.835 - 1.015 × (words/sentences) - 84.6 × (syllables/words)
-        Higher scores = easier to read
+        Higher scores = easier to read
+        Typical range: 0-100, but can exceed bounds
 
     Flesch-Kincaid Grade Level:
         Grade = 0.39 × (words/sentences) + 11.8 × (syllables/words) - 15.59
@@ -25,6 +103,10 @@ def compute_flesch(text: str) -> FleschResult:
         30-49: Difficult (College)
         0-29: Very Difficult (College graduate)
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Flesch, R. (1948). A new readability yardstick.
         Journal of Applied Psychology, 32(3), 221.
@@ -34,48 +116,115 @@ def compute_flesch(text: str) -> FleschResult:
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk. Use a large value (e.g., 1_000_000) for
+            single-chunk "aggregate" mode.
 
     Returns:
-        FleschResult with
+        FleschResult with:
+            - reading_ease: Mean reading ease across chunks
+            - grade_level: Mean grade level across chunks
+            - difficulty: Difficulty rating based on mean reading_ease
+            - reading_ease_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_flesch("
-        >>>
-
-        >>>
+        >>> result = compute_flesch("Long text here...", chunk_size=1000)
+        >>> result.reading_ease  # Mean across chunks
+        68.54
+        >>> result.reading_ease_dist.std  # Variance reveals fingerprint
+        4.2
+        >>> result.reading_ease_dist.values  # Per-chunk values
+        [65.2, 71.1, 68.8, ...]
+        >>> result.chunk_count
+        59
+
+        >>> # Single-chunk mode (no chunking)
+        >>> result = compute_flesch("Short text.", chunk_size=1_000_000)
+        >>> result.chunk_count
+        1
     """
-
-
-
-
+    import math
+
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+
+    # Compute metrics per chunk
+    reading_ease_values = []
+    grade_level_values = []
+    total_sentences = 0
+    total_words = 0
+    total_syllables = 0
+
+    for chunk in chunks:
+        re, gl, meta = _compute_flesch_single(chunk)
+        if not math.isnan(re):  # Only include valid results
+            reading_ease_values.append(re)
+            grade_level_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_syllables += meta.get("syllable_count", 0)
+
+    # Handle empty or all-invalid chunks
+    if not reading_ease_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FleschResult(
-            reading_ease=
-            grade_level=
+            reading_ease=float("nan"),
+            grade_level=float("nan"),
             difficulty="Unknown",
-
+            reading_ease_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "syllable_count": 0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_syllable_count": 0,
+            },
         )
 
-    #
-
-
-    # Calculate metrics
-    words_per_sentence = len(tokens) / len(sentences)
-    syllables_per_word = total_syllables / len(tokens)
+    # Build distributions
+    reading_ease_dist = make_distribution(reading_ease_values)
+    grade_level_dist = make_distribution(grade_level_values)
 
-    #
-
-
-    difficulty =
+    # Use mean for convenient access
+    mean_reading_ease = reading_ease_dist.mean
+    mean_grade_level = grade_level_dist.mean
+    difficulty = _get_difficulty(mean_reading_ease)
 
     return FleschResult(
-        reading_ease=
-        grade_level=
+        reading_ease=mean_reading_ease,
+        grade_level=mean_grade_level,
         difficulty=difficulty,
+        reading_ease_dist=reading_ease_dist,
+        grade_level_dist=grade_level_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
         metadata={
-
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
             "syllable_count": total_syllables,
-
-            "
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_syllable_count": total_syllables,
+            "words_per_sentence": total_words / total_sentences if total_sentences > 0 else 0,
+            "syllables_per_word": total_syllables / total_words if total_words > 0 else 0,
        },
    )
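As a quick orientation to the changes above, the arithmetic in _compute_flesch_single reduces to two weighted sums. The snippet below is illustrative only: the per-sentence and per-word counts are hypothetical, not output from the package, and the difficulty band follows the _get_difficulty thresholds shown in the diff.

    # Hypothetical counts, for illustration only (not package output)
    words_per_sentence = 15.0
    syllables_per_word = 1.5

    # Same formulas as _compute_flesch_single above
    reading_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
    grade_level = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59

    print(round(reading_ease, 2))  # 64.71 -> "Standard" band (>= 60)
    print(round(grade_level, 2))   # 7.96  -> roughly an 8th-grade reading level

compute_flesch repeats this arithmetic per chunk and summarizes the per-chunk values through make_distribution, so reading_ease is the mean across chunks while reading_ease_dist carries the spread (std, range, iqr).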
pystylometry/readability/gunning_fog.py

@@ -1,63 +1,236 @@
-"""Gunning Fog Index.
+"""Gunning Fog Index with NLP-enhanced complex word detection.
 
-
+This module computes the Gunning Fog Index, a readability metric that
+estimates the years of formal education needed to understand text on first reading.
+
+This implementation includes native chunked analysis for stylometric fingerprinting.
+
+Related GitHub Issues:
+    #4 - NLP-enhanced complex word detection
+    #27 - Native chunked analysis with Distribution dataclass
+
+Historical Background:
+----------------------
+The Gunning Fog Index was developed by Robert Gunning in 1952 as part of his
+work helping businesses improve the clarity of their writing. The formula produces
+a U.S. grade-level score (e.g., 12 = high school senior reading level).
+
+Reference:
+    Gunning, R. (1952). The Technique of Clear Writing.
+    McGraw-Hill, New York.
+"""
+
+import math
+
+from .._normalize import normalize_for_readability
+from .._types import Distribution, GunningFogResult, chunk_text, make_distribution
 from .._utils import split_sentences, tokenize
-from .
+from .complex_words import process_text_for_complex_words
+
+# Formula coefficient from Gunning (1952)
+_FOG_COEFFICIENT = 0.4
+
 
+def _compute_gunning_fog_single(text: str, spacy_model: str) -> tuple[float, float, dict]:
+    """Compute Gunning Fog metrics for a single chunk of text.
 
-
+    Returns:
+        Tuple of (fog_index, grade_level, metadata_dict).
+        Returns (nan, nan, metadata) for empty/invalid input.
    """
-
+    sentences = split_sentences(text)
+    all_tokens = tokenize(text)
+    tokens = normalize_for_readability(all_tokens)
+
+    if len(sentences) == 0 or len(tokens) == 0:
+        return (
+            float("nan"),
+            float("nan"),
+            {
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+            },
+        )
+
+    # Count complex words using NLP-enhanced detection
+    complex_word_count, detection_metadata = process_text_for_complex_words(
+        text, tokens, model=spacy_model
+    )
 
-
+    # Calculate formula components
+    average_words_per_sentence = len(tokens) / len(sentences)
+    complex_word_percentage = (complex_word_count / len(tokens)) * 100
+
+    # Apply Gunning Fog formula
+    fog_index = _FOG_COEFFICIENT * (average_words_per_sentence + complex_word_percentage)
+    grade_level = max(0, min(20, round(fog_index)))
+
+    metadata = {
+        "sentence_count": len(sentences),
+        "word_count": len(tokens),
+        "complex_word_count": complex_word_count,
+        "complex_word_percentage": complex_word_percentage,
+        "average_words_per_sentence": average_words_per_sentence,
+        **detection_metadata,
+    }
+
+    return (fog_index, float(grade_level), metadata)
+
+
+def compute_gunning_fog(
+    text: str, chunk_size: int = 1000, spacy_model: str = "en_core_web_sm"
+) -> GunningFogResult:
+    """
+    Compute Gunning Fog Index with NLP-enhanced complex word detection.
+
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+
+    Formula (Gunning, 1952):
+    ------------------------
     Fog Index = 0.4 × [(words/sentences) + 100 × (complex words/words)]
 
-    Where complex words are
-
+    Where complex words are words with 3+ syllables, EXCLUDING:
+        1. Proper nouns (names, places, organizations)
+        2. Compound words (hyphenated)
+        3. Common verb forms (-es, -ed, -ing endings)
 
-
-
+    Related GitHub Issues:
+        #4 - NLP-enhanced complex word detection
+        #27 - Native chunked analysis with Distribution dataclass
 
-
-        Gunning, R. (1952). The Technique of Clear Writing.
-        McGraw-Hill.
+    Reference:
+        Gunning, R. (1952). The Technique of Clear Writing. McGraw-Hill.
 
     Args:
         text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000).
+            The text is divided into chunks of this size, and metrics are
+            computed per-chunk.
+        spacy_model: spaCy model name for enhanced mode (default: "en_core_web_sm")
 
     Returns:
-        GunningFogResult with
+        GunningFogResult with:
+            - fog_index: Mean Fog Index across chunks
+            - grade_level: Mean grade level across chunks
+            - fog_index_dist: Distribution with per-chunk values and stats
+            - grade_level_dist: Distribution with per-chunk values and stats
+            - chunk_size: The chunk size used
+            - chunk_count: Number of chunks analyzed
 
     Example:
-        >>> result = compute_gunning_fog("
-        >>>
-
+        >>> result = compute_gunning_fog("Long text here...", chunk_size=1000)
+        >>> result.fog_index  # Mean across chunks
+        12.5
+        >>> result.fog_index_dist.std  # Variance reveals fingerprint
+        2.1
     """
-
-
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
 
-
+    # Compute metrics per chunk
+    fog_values = []
+    grade_values = []
+    total_sentences = 0
+    total_words = 0
+    total_complex = 0
+    detection_metadata: dict = {}
+
+    for chunk in chunks:
+        fi, gl, meta = _compute_gunning_fog_single(chunk, spacy_model)
+        if not math.isnan(fi):
+            fog_values.append(fi)
+            grade_values.append(gl)
+            total_sentences += meta.get("sentence_count", 0)
+            total_words += meta.get("word_count", 0)
+            total_complex += meta.get("complex_word_count", 0)
+            # Capture detection metadata from first chunk (same for all chunks)
+            if not detection_metadata and "mode" in meta:
+                detection_metadata = {
+                    "mode": meta.get("mode"),
+                    "proper_noun_detection": meta.get("proper_noun_detection"),
+                    "inflection_handling": meta.get("inflection_handling"),
+                }
+                if "spacy_model" in meta:
+                    detection_metadata["spacy_model"] = meta.get("spacy_model")
+
+    # Handle empty or all-invalid chunks
+    if not fog_values:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return GunningFogResult(
-            fog_index=
-            grade_level=
-
+            fog_index=float("nan"),
+            grade_level=float("nan"),
+            fog_index_dist=empty_dist,
+            grade_level_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=len(chunks),
+            metadata={
+                # Backward-compatible keys
+                "sentence_count": 0,
+                "word_count": 0,
+                "complex_word_count": 0,
+                "complex_word_percentage": 0.0,
+                "average_words_per_sentence": 0.0,
+                # New prefixed keys for consistency
+                "total_sentence_count": 0,
+                "total_word_count": 0,
+                "total_complex_word_count": 0,
+                "reliable": False,
+                # Detection metadata
+                "mode": "none",
+                "proper_noun_detection": "none",
+                "inflection_handling": "none",
+            },
        )
 
-    #
-
-
+    # Build distributions
+    fog_dist = make_distribution(fog_values)
+    grade_dist = make_distribution(grade_values)
+
+    # Reliability heuristic
+    reliable = total_words >= 100 and total_sentences >= 3
 
-    #
-
-
+    # Ensure detection metadata has defaults
+    if not detection_metadata:
+        detection_metadata = {
+            "mode": "none",
+            "proper_noun_detection": "none",
+            "inflection_handling": "none",
+        }
 
     return GunningFogResult(
-        fog_index=
-        grade_level=
+        fog_index=fog_dist.mean,
+        grade_level=grade_dist.mean,
+        fog_index_dist=fog_dist,
+        grade_level_dist=grade_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
        metadata={
-
-            "
-            "
-            "
+            # Backward-compatible keys
+            "sentence_count": total_sentences,
+            "word_count": total_words,
+            "complex_word_count": total_complex,
+            "complex_word_percentage": (total_complex / total_words * 100)
+            if total_words > 0
+            else 0,
+            "average_words_per_sentence": total_words / total_sentences
+            if total_sentences > 0
+            else 0,
+            # New prefixed keys for consistency
+            "total_sentence_count": total_sentences,
+            "total_word_count": total_words,
+            "total_complex_word_count": total_complex,
+            "reliable": reliable,
+            # Detection metadata
+            **detection_metadata,
        },
    )
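The Gunning Fog computation above follows the same pattern: per chunk it boils down to one weighted sum plus a 0-20 clamp on the rounded grade. The snippet below is illustrative only, with hypothetical counts standing in for the output of the complex-word detector.

    # Hypothetical counts, for illustration only (not package output)
    average_words_per_sentence = 20.0
    complex_word_percentage = 10.0  # 100 * complex_words / words

    # Same formula as _compute_gunning_fog_single above
    fog_index = 0.4 * (average_words_per_sentence + complex_word_percentage)
    grade_level = max(0, min(20, round(fog_index)))

    print(fog_index)    # 12.0 -> roughly a U.S. high-school-senior reading level
    print(grade_level)  # 12

In the packaged code, complex_word_percentage comes from process_text_for_complex_words in complex_words.py (which excludes proper nouns, hyphenated compounds, and common inflections), and compute_gunning_fog averages the per-chunk fog_index values via make_distribution.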