PyPI - pystylometry - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (44) hide show

pystylometry/__init__.py +29 -3
pystylometry/_types.py +963 -259
pystylometry/authorship/__init__.py +23 -2
pystylometry/authorship/additional_methods.py +4 -29
pystylometry/authorship/kilgarriff.py +347 -0
pystylometry/character/character_metrics.py +267 -179
pystylometry/cli.py +427 -0
pystylometry/consistency/__init__.py +57 -0
pystylometry/consistency/_thresholds.py +162 -0
pystylometry/consistency/drift.py +549 -0
pystylometry/dialect/__init__.py +65 -0
pystylometry/dialect/_data/dialect_markers.json +1134 -0
pystylometry/dialect/_loader.py +360 -0
pystylometry/dialect/detector.py +533 -0
pystylometry/lexical/advanced_diversity.py +61 -22
pystylometry/lexical/function_words.py +255 -56
pystylometry/lexical/hapax.py +182 -52
pystylometry/lexical/mtld.py +108 -26
pystylometry/lexical/ttr.py +76 -10
pystylometry/lexical/word_frequency_sophistication.py +1522 -298
pystylometry/lexical/yule.py +136 -50
pystylometry/ngrams/entropy.py +150 -49
pystylometry/readability/additional_formulas.py +1887 -762
pystylometry/readability/ari.py +144 -82
pystylometry/readability/coleman_liau.py +136 -109
pystylometry/readability/flesch.py +177 -73
pystylometry/readability/gunning_fog.py +165 -161
pystylometry/readability/smog.py +123 -42
pystylometry/syntactic/advanced_syntactic.py +76 -14
pystylometry/syntactic/pos_ratios.py +70 -6
pystylometry/syntactic/sentence_stats.py +55 -12
pystylometry/syntactic/sentence_types.py +71 -15
pystylometry/viz/__init__.py +71 -0
pystylometry/viz/drift.py +589 -0
pystylometry/viz/jsx/__init__.py +31 -0
pystylometry/viz/jsx/_base.py +144 -0
pystylometry/viz/jsx/report.py +677 -0
pystylometry/viz/jsx/timeline.py +716 -0
pystylometry/viz/jsx/viewer.py +1032 -0
{pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/METADATA +5 -2
pystylometry-1.1.0.dist-info/RECORD +63 -0
{pystylometry-1.0.0.dist-info → pystylometry-1.1.0.dist-info}/WHEEL +1 -1
pystylometry-1.1.0.dist-info/entry_points.txt +4 -0
pystylometry-1.0.0.dist-info/RECORD +0 -46

pystylometry/character/character_metrics.py CHANGED Viewed

@@ -5,9 +5,9 @@ in writing style. Character-level metrics are fundamental for authorship
 attribution and can reveal distinctive patterns in punctuation usage,
 word construction, and formatting preferences.
-Related GitHub Issue:
+Related GitHub Issues:
     #12 - Character-Level Metrics
-    https://github.com/craigtrim/pystylometry/issues/12
+    #27 - Native chunked analysis with Distribution dataclass
 Features implemented:
     - Average word length (characters per word)
@@ -26,114 +26,80 @@ References:
         JASIST, 60(3), 538-556.
 """
-from .._types import CharacterMetricsResult
-def compute_character_metrics(text: str) -> CharacterMetricsResult:
+import math
+from .._types import CharacterMetricsResult, Distribution, chunk_text, make_distribution
+# Character sets
+_PUNCTUATION = {
+    ".",
+    ",",
+    "!",
+    "?",
+    ";",
+    ":",
+    "-",
+    "—",
+    "–",  # Basic punctuation
+    "'",
+    '"',
+    """, """,
+    "'",
+    "'",  # Quotes
+    "(",
+    ")",
+    "[",
+    "]",
+    "{",
+    "}",  # Brackets
+    "/",
+    "\\",
+    "|",  # Slashes
+    "…",  # Ellipsis
+    "*",
+    "&",
+    "@",
+    "#",
+    "$",
+    "%",
+    "^",
+    "~",
+    "`",  # Special symbols
+}
+_VOWELS = {"a", "e", "i", "o", "u"}
+_STANDARD_LETTERS = set("abcdefghijklmnopqrstuvwxyz")
+def _compute_character_metrics_single(text: str) -> dict:
+    """Compute character-level metrics for a single chunk of text.
+    Returns a dict with all computed values, or values containing nan for empty text.
     """
-    Compute character-level stylometric metrics.
-    This function analyzes text at the character level to extract features
-    related to word length, punctuation usage, letter distribution, and
-    other low-level patterns that can be distinctive for authorship
-    attribution and style analysis.
-    Related GitHub Issue:
-        #12 - Character-Level Metrics
-        https://github.com/craigtrim/pystylometry/issues/12
-    Character-level features are particularly valuable because:
-        1. They are language-independent (work across languages)
-        2. They capture subconscious writing patterns
-        3. They are resistant to topic variation
-        4. They complement higher-level metrics (words, syntax)
-    Metrics computed:
-        - Average word length: Mean characters per word
-        - Average sentence length (chars): Mean characters per sentence
-        - Punctuation density: Punctuation marks per 100 words
-        - Punctuation variety: Count of unique punctuation types used
-        - Letter frequency: Distribution of a-z (case-insensitive)
-        - Vowel-to-consonant ratio: Ratio of vowels to consonants
-        - Digit count/ratio: Numeric character usage
-        - Uppercase ratio: Uppercase letters / total letters
-        - Whitespace ratio: Whitespace characters / total characters
-    Args:
-        text: Input text to analyze. Should contain at least one sentence
-              for meaningful results. Empty text will return NaN for ratios
-              and 0 for counts.
-    Returns:
-        CharacterMetricsResult with all character-level features and metadata.
-        For empty text, all ratios will be NaN and counts will be 0.
-    Example:
-        >>> result = compute_character_metrics("The quick brown fox jumps!")
-        >>> print(f"Avg word length: {result.avg_word_length:.2f}")
-        Avg word length: 4.17
-        >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
-        Punctuation density: 16.67
-        >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
-        Vowel/consonant ratio: 0.71
-        >>> # Empty text handling
-        >>> result = compute_character_metrics("")
-        >>> import math
-        >>> math.isnan(result.avg_word_length)
-        True
-        >>> result.digit_count
-        0
-    Note:
-        - Punctuation marks include: . , ! ? ; : - ' " ( ) [ ] { } ... etc.
-        - Whitespace includes spaces, tabs, newlines
-        - Letter frequency is case-insensitive (lowercase normalized)
-        - Words are tokenized by whitespace for length calculation
-        - Sentences are split using standard sentence delimiters (. ! ?)
-    """
-    # Define character sets
-    # GitHub Issue #12: https://github.com/craigtrim/pystylometry/issues/12
-    PUNCTUATION = {
-        ".", ",", "!", "?", ";", ":", "-", "—", "–",  # Basic punctuation
-        "'", '"', """, """, "'", "'",  # Quotes
-        "(", ")", "[", "]", "{", "}",  # Brackets
-        "/", "\\", "|",  # Slashes
-        "…",  # Ellipsis
-        "*", "&", "@", "#", "$", "%", "^", "~", "`",  # Special symbols
-    }
-    VOWELS = {"a", "e", "i", "o", "u"}
-    # Handle empty text
     if not text:
-        # Return NaN for all ratios, 0 for all counts
-        empty_letter_freq = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
-        return CharacterMetricsResult(
-            avg_word_length=float("nan"),
-            avg_sentence_length_chars=float("nan"),
-            punctuation_density=float("nan"),
-            punctuation_variety=0,
-            letter_frequency=empty_letter_freq,
-            vowel_consonant_ratio=float("nan"),
-            digit_count=0,
-            digit_ratio=float("nan"),
-            uppercase_ratio=float("nan"),
-            whitespace_ratio=float("nan"),
-            metadata={
-                "total_characters": 0,
-                "total_letters": 0,
-                "total_words": 0,
-                "total_sentences": 0,
-                "total_punctuation": 0,
-                "total_whitespace": 0,
-                "total_digits": 0,
-                "punctuation_types": [],
-                "vowel_count": 0,
-                "consonant_count": 0,
-                "uppercase_count": 0,
-                "lowercase_count": 0,
-            },
-        )
+        return {
+            "avg_word_length": float("nan"),
+            "avg_sentence_length_chars": float("nan"),
+            "punctuation_density": float("nan"),
+            "punctuation_variety": 0,
+            "vowel_consonant_ratio": float("nan"),
+            "digit_count": 0,
+            "digit_ratio": float("nan"),
+            "uppercase_ratio": float("nan"),
+            "whitespace_ratio": float("nan"),
+            "letter_frequency": {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"},
+            "total_characters": 0,
+            "total_letters": 0,
+            "total_words": 0,
+            "total_sentences": 0,
+            "total_punctuation": 0,
+            "total_whitespace": 0,
+            "total_digits": 0,
+            "punctuation_types": [],
+            "vowel_count": 0,
+            "consonant_count": 0,
+            "uppercase_count": 0,
+            "lowercase_count": 0,
+        }
     # Initialize counters
     total_chars = len(text)
@@ -147,19 +113,18 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
     punctuation_count = 0
     punctuation_types = set()
-    # Single pass through text to classify and count all characters
+    # Single pass through text
     for char in text:
         if char.isalpha():
-            # Letter - update letter frequency (case-insensitive)
-            letter_counts[char.lower()] += 1
+            lower_char = char.lower()
+            if lower_char in _STANDARD_LETTERS:
+                letter_counts[lower_char] += 1
-            # Count vowels and consonants
-            if char.lower() in VOWELS:
+            if lower_char in _VOWELS:
                 vowel_count += 1
-            else:
+            elif lower_char in _STANDARD_LETTERS:
                 consonant_count += 1
-            # Count uppercase and lowercase
             if char.isupper():
                 uppercase_count += 1
             else:
@@ -167,44 +132,35 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
         elif char.isdigit():
             digit_count += 1
         elif char.isspace():
             whitespace_count += 1
-        elif char in PUNCTUATION:
+        elif char in _PUNCTUATION:
             punctuation_count += 1
             punctuation_types.add(char)
     total_letters = vowel_count + consonant_count
-    # Calculate letter frequency distribution (normalize to sum to 1.0)
+    # Letter frequency distribution
     if total_letters > 0:
-        letter_frequency = {letter: count / total_letters for letter, count in letter_counts.items()}
+        letter_frequency = {
+            letter: count / total_letters for letter, count in letter_counts.items()
+        }
     else:
         letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
-    # Tokenize into words (split on whitespace, then strip punctuation for length)
+    # Word metrics
     words = text.split()
     total_words = len(words)
-    # Calculate average word length (count only letters and digits in words)
     if total_words > 0:
-        word_lengths = []
-        for word in words:
-            # Count only alphanumeric characters for word length
-            word_length = sum(1 for char in word if char.isalnum())
-            if word_length > 0:  # Only count words with at least one alphanumeric char
-                word_lengths.append(word_length)
-        if word_lengths:
-            avg_word_length = sum(word_lengths) / len(word_lengths)
-        else:
-            avg_word_length = float("nan")
+        word_lengths = [
+            sum(1 for c in w if c.isalnum()) for w in words if any(c.isalnum() for c in w)
+        ]
+        avg_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else float("nan")
     else:
         avg_word_length = float("nan")
-    # Segment text into sentences (split on . ! ?)
-    # Simple approach: split on sentence delimiters
+    # Sentence metrics
     sentence_delimiters = {".", "!", "?"}
     sentences = []
     current_sentence = []
@@ -212,13 +168,11 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
     for char in text:
         current_sentence.append(char)
         if char in sentence_delimiters:
-            # End of sentence
             sentence_text = "".join(current_sentence).strip()
-            if sentence_text:  # Only add non-empty sentences
+            if sentence_text:
                 sentences.append(sentence_text)
             current_sentence = []
-    # Add any remaining text as a sentence if it's non-empty and doesn't end with delimiter
     if current_sentence:
         sentence_text = "".join(current_sentence).strip()
         if sentence_text:
@@ -226,52 +180,40 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
     total_sentences = len(sentences)
-    # Calculate average sentence length in characters
     if total_sentences > 0:
         sentence_lengths = [len(sent) for sent in sentences]
         avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
     else:
         avg_sentence_length_chars = float("nan")
-    # Calculate punctuation density (per 100 words)
-    if total_words > 0:
-        punctuation_density = (punctuation_count / total_words) * 100
-    else:
-        punctuation_density = float("nan")
-    # Punctuation variety (count of unique punctuation types)
+    # Ratios
+    punctuation_density = (
+        (punctuation_count / total_words * 100) if total_words > 0 else float("nan")
+    )
     punctuation_variety = len(punctuation_types)
-    # Calculate vowel-to-consonant ratio
     if consonant_count > 0:
         vowel_consonant_ratio = vowel_count / consonant_count
     elif vowel_count > 0:
-        # Vowels but no consonants - ratio is infinity
         vowel_consonant_ratio = float("inf")
     else:
-        # No letters at all
         vowel_consonant_ratio = float("nan")
-    # Calculate digit ratio
-    if total_chars > 0:
-        digit_ratio = digit_count / total_chars
-    else:
-        digit_ratio = float("nan")
-    # Calculate uppercase ratio
-    if total_letters > 0:
-        uppercase_ratio = uppercase_count / total_letters
-    else:
-        uppercase_ratio = float("nan")
-    # Calculate whitespace ratio
-    if total_chars > 0:
-        whitespace_ratio = whitespace_count / total_chars
-    else:
-        whitespace_ratio = float("nan")
-    # Build metadata
-    metadata = {
+    digit_ratio = digit_count / total_chars if total_chars > 0 else float("nan")
+    uppercase_ratio = uppercase_count / total_letters if total_letters > 0 else float("nan")
+    whitespace_ratio = whitespace_count / total_chars if total_chars > 0 else float("nan")
+    return {
+        "avg_word_length": avg_word_length,
+        "avg_sentence_length_chars": avg_sentence_length_chars,
+        "punctuation_density": punctuation_density,
+        "punctuation_variety": punctuation_variety,
+        "vowel_consonant_ratio": vowel_consonant_ratio,
+        "digit_count": digit_count,
+        "digit_ratio": digit_ratio,
+        "uppercase_ratio": uppercase_ratio,
+        "whitespace_ratio": whitespace_ratio,
+        "letter_frequency": letter_frequency,
         "total_characters": total_chars,
         "total_letters": total_letters,
         "total_words": total_words,
@@ -286,16 +228,162 @@ def compute_character_metrics(text: str) -> CharacterMetricsResult:
         "lowercase_count": lowercase_count,
     }
+def compute_character_metrics(text: str, chunk_size: int = 1000) -> CharacterMetricsResult:
+    """
+    Compute character-level stylometric metrics.
+    This function uses native chunked analysis to capture variance and patterns
+    across the text, which is essential for stylometric fingerprinting.
+    Related GitHub Issues:
+        #12 - Character-Level Metrics
+        #27 - Native chunked analysis with Distribution dataclass
+    Character-level features are particularly valuable because:
+        1. They are language-independent (work across languages)
+        2. They capture subconscious writing patterns
+        3. They are resistant to topic variation
+        4. They complement higher-level metrics (words, syntax)
+    Metrics computed:
+        - Average word length: Mean characters per word
+        - Average sentence length (chars): Mean characters per sentence
+        - Punctuation density: Punctuation marks per 100 words
+        - Punctuation variety: Count of unique punctuation types used
+        - Letter frequency: Distribution of a-z (case-insensitive)
+        - Vowel-to-consonant ratio: Ratio of vowels to consonants
+        - Digit count/ratio: Numeric character usage
+        - Uppercase ratio: Uppercase letters / total letters
+        - Whitespace ratio: Whitespace characters / total characters
+    Args:
+        text: Input text to analyze
+        chunk_size: Number of words per chunk (default: 1000)
+    Returns:
+        CharacterMetricsResult with all character-level features, distributions,
+        and metadata.
+    Example:
+        >>> result = compute_character_metrics("Long text...", chunk_size=1000)
+        >>> result.avg_word_length  # Mean across chunks
+        4.5
+        >>> result.avg_word_length_dist.std  # Variance reveals fingerprint
+        0.3
+    """
+    # Chunk the text
+    chunks = chunk_text(text, chunk_size)
+    # Compute metrics per chunk
+    chunk_results = [_compute_character_metrics_single(chunk) for chunk in chunks]
+    # Collect values for distributions
+    avg_word_length_vals = [
+        r["avg_word_length"] for r in chunk_results if not math.isnan(r["avg_word_length"])
+    ]
+    avg_sentence_vals = [
+        r["avg_sentence_length_chars"]
+        for r in chunk_results
+        if not math.isnan(r["avg_sentence_length_chars"])
+    ]
+    punct_density_vals = [
+        r["punctuation_density"] for r in chunk_results if not math.isnan(r["punctuation_density"])
+    ]
+    punct_variety_vals = [float(r["punctuation_variety"]) for r in chunk_results]
+    vc_ratio_vals = [
+        r["vowel_consonant_ratio"]
+        for r in chunk_results
+        if not math.isnan(r["vowel_consonant_ratio"]) and not math.isinf(r["vowel_consonant_ratio"])
+    ]
+    digit_ratio_vals = [r["digit_ratio"] for r in chunk_results if not math.isnan(r["digit_ratio"])]
+    uppercase_ratio_vals = [
+        r["uppercase_ratio"] for r in chunk_results if not math.isnan(r["uppercase_ratio"])
+    ]
+    whitespace_ratio_vals = [
+        r["whitespace_ratio"] for r in chunk_results if not math.isnan(r["whitespace_ratio"])
+    ]
+    # Aggregate totals
+    total_digits = sum(r["digit_count"] for r in chunk_results)
+    total_characters = sum(r["total_characters"] for r in chunk_results)
+    total_letters = sum(r["total_letters"] for r in chunk_results)
+    total_words = sum(r["total_words"] for r in chunk_results)
+    total_sentences = sum(r["total_sentences"] for r in chunk_results)
+    total_punctuation = sum(r["total_punctuation"] for r in chunk_results)
+    total_whitespace = sum(r["total_whitespace"] for r in chunk_results)
+    total_vowel_count = sum(r["vowel_count"] for r in chunk_results)
+    total_consonant_count = sum(r["consonant_count"] for r in chunk_results)
+    total_uppercase_count = sum(r["uppercase_count"] for r in chunk_results)
+    total_lowercase_count = sum(r["lowercase_count"] for r in chunk_results)
+    all_punctuation_types = set()
+    for r in chunk_results:
+        all_punctuation_types.update(r["punctuation_types"])
+    # Aggregate letter frequency
+    total_letter_counts = {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+    for r in chunk_results:
+        if r["total_letters"] > 0:
+            for letter, freq in r["letter_frequency"].items():
+                total_letter_counts[letter] += freq * r["total_letters"]
+    if total_letters > 0:
+        letter_frequency = {
+            letter: count / total_letters for letter, count in total_letter_counts.items()
+        }
+    else:
+        letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+    # Build distributions (handle empty case)
+    def safe_dist(values: list[float]) -> Distribution:
+        if not values:
+            return Distribution(
+                values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+            )
+        return make_distribution(values)
+    avg_word_length_dist = safe_dist(avg_word_length_vals)
+    avg_sentence_dist = safe_dist(avg_sentence_vals)
+    punct_density_dist = safe_dist(punct_density_vals)
+    punct_variety_dist = safe_dist(punct_variety_vals)
+    vc_ratio_dist = safe_dist(vc_ratio_vals)
+    digit_ratio_dist = safe_dist(digit_ratio_vals)
+    uppercase_ratio_dist = safe_dist(uppercase_ratio_vals)
+    whitespace_ratio_dist = safe_dist(whitespace_ratio_vals)
     return CharacterMetricsResult(
-        avg_word_length=avg_word_length,
-        avg_sentence_length_chars=avg_sentence_length_chars,
-        punctuation_density=punctuation_density,
-        punctuation_variety=punctuation_variety,
+        avg_word_length=avg_word_length_dist.mean,
+        avg_sentence_length_chars=avg_sentence_dist.mean,
+        punctuation_density=punct_density_dist.mean,
+        punctuation_variety=punct_variety_dist.mean,
         letter_frequency=letter_frequency,
-        vowel_consonant_ratio=vowel_consonant_ratio,
-        digit_count=digit_count,
-        digit_ratio=digit_ratio,
-        uppercase_ratio=uppercase_ratio,
-        whitespace_ratio=whitespace_ratio,
-        metadata=metadata,
+        vowel_consonant_ratio=vc_ratio_dist.mean,
+        digit_count=total_digits,
+        digit_ratio=digit_ratio_dist.mean,
+        uppercase_ratio=uppercase_ratio_dist.mean,
+        whitespace_ratio=whitespace_ratio_dist.mean,
+        avg_word_length_dist=avg_word_length_dist,
+        avg_sentence_length_chars_dist=avg_sentence_dist,
+        punctuation_density_dist=punct_density_dist,
+        punctuation_variety_dist=punct_variety_dist,
+        vowel_consonant_ratio_dist=vc_ratio_dist,
+        digit_ratio_dist=digit_ratio_dist,
+        uppercase_ratio_dist=uppercase_ratio_dist,
+        whitespace_ratio_dist=whitespace_ratio_dist,
+        chunk_size=chunk_size,
+        chunk_count=len(chunks),
+        metadata={
+            "total_characters": total_characters,
+            "total_letters": total_letters,
+            "total_words": total_words,
+            "total_sentences": total_sentences,
+            "total_punctuation": total_punctuation,
+            "total_whitespace": total_whitespace,
+            "total_digits": total_digits,
+            "punctuation_types": sorted(list(all_punctuation_types)),
+            "vowel_count": total_vowel_count,
+            "consonant_count": total_consonant_count,
+            "uppercase_count": total_uppercase_count,
+            "lowercase_count": total_lowercase_count,
+        },
     )

pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

pystylometry 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl