pystylometry 0.1.0__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. pystylometry/__init__.py +1 -2
  2. pystylometry/_normalize.py +277 -0
  3. pystylometry/_types.py +1224 -2
  4. pystylometry/_utils.py +4 -0
  5. pystylometry/authorship/__init__.py +4 -0
  6. pystylometry/authorship/additional_methods.py +100 -0
  7. pystylometry/character/__init__.py +15 -0
  8. pystylometry/character/character_metrics.py +301 -0
  9. pystylometry/lexical/__init__.py +13 -6
  10. pystylometry/lexical/advanced_diversity.py +641 -0
  11. pystylometry/lexical/function_words.py +391 -0
  12. pystylometry/lexical/hapax.py +154 -7
  13. pystylometry/lexical/mtld.py +83 -7
  14. pystylometry/lexical/ttr.py +83 -0
  15. pystylometry/lexical/word_frequency_sophistication.py +581 -0
  16. pystylometry/lexical/yule.py +34 -7
  17. pystylometry/ngrams/__init__.py +2 -0
  18. pystylometry/ngrams/extended_ngrams.py +235 -0
  19. pystylometry/prosody/__init__.py +12 -0
  20. pystylometry/prosody/rhythm_prosody.py +53 -0
  21. pystylometry/readability/__init__.py +12 -0
  22. pystylometry/readability/additional_formulas.py +985 -0
  23. pystylometry/readability/ari.py +93 -17
  24. pystylometry/readability/coleman_liau.py +102 -9
  25. pystylometry/readability/complex_words.py +531 -0
  26. pystylometry/readability/flesch.py +59 -14
  27. pystylometry/readability/gunning_fog.py +194 -25
  28. pystylometry/readability/smog.py +31 -14
  29. pystylometry/readability/syllables.py +137 -30
  30. pystylometry/stylistic/__init__.py +20 -0
  31. pystylometry/stylistic/cohesion_coherence.py +45 -0
  32. pystylometry/stylistic/genre_register.py +45 -0
  33. pystylometry/stylistic/markers.py +131 -0
  34. pystylometry/stylistic/vocabulary_overlap.py +47 -0
  35. pystylometry/syntactic/__init__.py +4 -0
  36. pystylometry/syntactic/advanced_syntactic.py +432 -0
  37. pystylometry/syntactic/pos_ratios.py +104 -13
  38. pystylometry/syntactic/sentence_stats.py +57 -13
  39. pystylometry/syntactic/sentence_types.py +470 -0
  40. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/METADATA +49 -12
  41. pystylometry-1.0.0.dist-info/RECORD +46 -0
  42. {pystylometry-0.1.0.dist-info → pystylometry-1.0.0.dist-info}/WHEEL +1 -1
  43. pystylometry-0.1.0.dist-info/RECORD +0 -26
pystylometry/_utils.py CHANGED
@@ -9,9 +9,13 @@ from .tokenizer import Tokenizer
 # ===== Convenience Functions =====
 
 # Default tokenizer instance for backward compatibility
+# Preserves emails and URLs to allow readability metrics (like Coleman-Liau)
+# to count their alphabetic characters
 _default_tokenizer = Tokenizer(
     lowercase=False,
     strip_punctuation=False,
+    preserve_urls=True,
+    preserve_emails=True,
 )
 
 
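Why the new defaults matter: Coleman-Liau is driven by letters per word, so a tokenizer that drops URL and email tokens deflates the per-word character count. A minimal plain-Python sketch of the effect (illustration only, not the package's Tokenizer API; the sample text is hypothetical):

    text = "Contact me at alice@example.com for details"
    tokens = text.split()

    def letters_per_word(tokens: list[str]) -> float:
        # Mean number of alphabetic characters per whitespace token.
        return sum(sum(ch.isalpha() for ch in tok) for tok in tokens) / len(tokens)

    print(f"{letters_per_word(tokens):.2f}")                               # 6.00, email kept
    print(f"{letters_per_word([t for t in tokens if '@' not in t]):.2f}")  # 4.20, email dropped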
pystylometry/authorship/__init__.py CHANGED
@@ -1,5 +1,6 @@
 """Authorship attribution metrics."""
 
+from .additional_methods import compute_johns_delta, compute_kilgarriff, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
 from .zeta import compute_zeta
 
@@ -7,4 +8,7 @@ __all__ = [
     "compute_burrows_delta",
     "compute_cosine_delta",
     "compute_zeta",
+    "compute_kilgarriff",
+    "compute_minmax",
+    "compute_johns_delta",
 ]
pystylometry/authorship/additional_methods.py ADDED
@@ -0,0 +1,100 @@
+"""Additional authorship attribution methods.
+
+This module provides alternative distance/similarity metrics for authorship
+attribution beyond Burrows' Delta and Zeta.
+
+Related GitHub Issue:
+    #24 - Additional Authorship Attribution Methods
+    https://github.com/craigtrim/pystylometry/issues/24
+
+Methods implemented:
+    - Kilgarriff's Chi-squared
+    - Min-Max (Burrows' original method)
+    - John Burrows' Delta variations
+
+References:
+    Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
+    Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
+    Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+"""
+
+from .._types import JohnsBurrowsResult, KilgarriffResult, MinMaxResult
+
+
+def compute_kilgarriff(text1: str, text2: str, mfw: int = 100) -> KilgarriffResult:
+    """
+    Compute Kilgarriff's Chi-squared distance between two texts.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+
+    Returns:
+        KilgarriffResult with chi-squared statistic, p-value, and
+        most distinctive features.
+    """
+    # TODO: Implement Kilgarriff's chi-squared
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "Kilgarriff's chi-squared not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
+
+
+def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
+    """
+    Compute Min-Max distance (Burrows' original method).
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+
+    Returns:
+        MinMaxResult with min-max distance and distinctive features.
+    """
+    # TODO: Implement Min-Max distance
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "Min-Max distance not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
+
+
+def compute_johns_delta(
+    text1: str,
+    text2: str,
+    mfw: int = 100,
+    method: str = "quadratic",
+) -> JohnsBurrowsResult:
+    """
+    Compute John Burrows' Delta variations.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Args:
+        text1: First text for comparison
+        text2: Second text for comparison
+        mfw: Number of most frequent words to analyze
+        method: Delta variation ("quadratic", "weighted", "rotated")
+
+    Returns:
+        JohnsBurrowsResult with delta score and method details.
+    """
+    # TODO: Implement John's Delta variations
+    # GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24
+    raise NotImplementedError(
+        "John's Delta variations not yet implemented. "
+        "See GitHub Issue #24: https://github.com/craigtrim/pystylometry/issues/24"
+    )
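All three functions are stubs that raise NotImplementedError pending Issue #24. For orientation, here is a hedged sketch of what Kilgarriff's chi-squared over the mfw most frequent words could look like, following Kilgarriff (2001); this is an illustration of the method, not the package's pending implementation, and it assumes whitespace tokenization and non-empty texts:

    from collections import Counter

    def kilgarriff_chi2_sketch(text1: str, text2: str, mfw: int = 100) -> float:
        c1 = Counter(text1.lower().split())
        c2 = Counter(text2.lower().split())
        n1, n2 = sum(c1.values()), sum(c2.values())

        chi2 = 0.0
        # Rank words by combined frequency and keep the top `mfw`.
        for word, _ in (c1 + c2).most_common(mfw):
            observed1, observed2 = c1[word], c2[word]
            pooled = observed1 + observed2
            # Expected counts split the pooled total in proportion to corpus size.
            expected1 = pooled * n1 / (n1 + n2)
            expected2 = pooled * n2 / (n1 + n2)
            chi2 += (observed1 - expected1) ** 2 / expected1
            chi2 += (observed2 - expected2) ** 2 / expected2
        return chi2

Lower scores indicate more similar word-frequency profiles; the p-value and distinctive features promised by KilgarriffResult would fall out of the same table of observed and expected counts.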
pystylometry/character/__init__.py ADDED
@@ -0,0 +1,15 @@
+"""Character-level metrics for stylometric analysis.
+
+This package provides character-level features for authorship attribution
+and style analysis.
+
+Related GitHub Issue:
+    #12 - Character-Level Metrics
+    https://github.com/craigtrim/pystylometry/issues/12
+"""
+
+from .character_metrics import compute_character_metrics
+
+__all__ = [
+    "compute_character_metrics",
+]
pystylometry/character/character_metrics.py ADDED
@@ -0,0 +1,301 @@
+"""Character-level metrics for stylometric analysis.
+
+This module provides character-level features that capture low-level patterns
+in writing style. Character-level metrics are fundamental for authorship
+attribution and can reveal distinctive patterns in punctuation usage,
+word construction, and formatting preferences.
+
+Related GitHub Issue:
+    #12 - Character-Level Metrics
+    https://github.com/craigtrim/pystylometry/issues/12
+
+Features implemented:
+    - Average word length (characters per word)
+    - Average sentence length (characters per sentence)
+    - Punctuation density and variety
+    - Letter frequency distribution
+    - Vowel-to-consonant ratio
+    - Digit frequency and ratio
+    - Uppercase ratio
+    - Whitespace ratio
+
+References:
+    Grieve, J. (2007). Quantitative authorship attribution: An evaluation
+        of techniques. Literary and Linguistic Computing, 22(3), 251-270.
+    Stamatatos, E. (2009). A survey of modern authorship attribution methods.
+        JASIST, 60(3), 538-556.
+"""
+
+from .._types import CharacterMetricsResult
+
+
+def compute_character_metrics(text: str) -> CharacterMetricsResult:
+    """
+    Compute character-level stylometric metrics.
+
+    This function analyzes text at the character level to extract features
+    related to word length, punctuation usage, letter distribution, and
+    other low-level patterns that can be distinctive for authorship
+    attribution and style analysis.
+
+    Related GitHub Issue:
+        #12 - Character-Level Metrics
+        https://github.com/craigtrim/pystylometry/issues/12
+
+    Character-level features are particularly valuable because:
+    1. They are language-independent (work across languages)
+    2. They capture subconscious writing patterns
+    3. They are resistant to topic variation
+    4. They complement higher-level metrics (words, syntax)
+
+    Metrics computed:
+        - Average word length: Mean characters per word
+        - Average sentence length (chars): Mean characters per sentence
+        - Punctuation density: Punctuation marks per 100 words
+        - Punctuation variety: Count of unique punctuation types used
+        - Letter frequency: Distribution of a-z (case-insensitive)
+        - Vowel-to-consonant ratio: Ratio of vowels to consonants
+        - Digit count/ratio: Numeric character usage
+        - Uppercase ratio: Uppercase letters / total letters
+        - Whitespace ratio: Whitespace characters / total characters
+
+    Args:
+        text: Input text to analyze. Should contain at least one sentence
+            for meaningful results. Empty text will return NaN for ratios
+            and 0 for counts.
+
+    Returns:
+        CharacterMetricsResult with all character-level features and metadata.
+        For empty text, all ratios will be NaN and counts will be 0.
+
+    Example:
+        >>> result = compute_character_metrics("The quick brown fox jumps!")
+        >>> print(f"Avg word length: {result.avg_word_length:.2f}")
+        Avg word length: 4.20
+        >>> print(f"Punctuation density: {result.punctuation_density:.2f}")
+        Punctuation density: 20.00
+        >>> print(f"Vowel/consonant ratio: {result.vowel_consonant_ratio:.2f}")
+        Vowel/consonant ratio: 0.40
+
+        >>> # Empty text handling
+        >>> result = compute_character_metrics("")
+        >>> import math
+        >>> math.isnan(result.avg_word_length)
+        True
+        >>> result.digit_count
+        0
+
+    Note:
+        - Punctuation marks include: . , ! ? ; : - ' " ( ) [ ] { } ... etc.
+        - Whitespace includes spaces, tabs, newlines
+        - Letter frequency is case-insensitive (lowercase normalized; non-ASCII letters are skipped)
+        - Words are tokenized by whitespace for length calculation
+        - Sentences are split using standard sentence delimiters (. ! ?)
+    """
+    # Define character sets
+    # GitHub Issue #12: https://github.com/craigtrim/pystylometry/issues/12
+    PUNCTUATION = {
+        ".", ",", "!", "?", ";", ":", "-", "—", "–",  # Basic punctuation
+        "'", '"', """, """, "'", "'",  # Quotes
+        "(", ")", "[", "]", "{", "}",  # Brackets
+        "/", "\\", "|",  # Slashes
+        "…",  # Ellipsis
+        "*", "&", "@", "#", "$", "%", "^", "~", "`",  # Special symbols
+    }
+    VOWELS = {"a", "e", "i", "o", "u"}
+
+    # Handle empty text
+    if not text:
+        # Return NaN for all ratios, 0 for all counts
+        empty_letter_freq = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+        return CharacterMetricsResult(
+            avg_word_length=float("nan"),
+            avg_sentence_length_chars=float("nan"),
+            punctuation_density=float("nan"),
+            punctuation_variety=0,
+            letter_frequency=empty_letter_freq,
+            vowel_consonant_ratio=float("nan"),
+            digit_count=0,
+            digit_ratio=float("nan"),
+            uppercase_ratio=float("nan"),
+            whitespace_ratio=float("nan"),
+            metadata={
+                "total_characters": 0,
+                "total_letters": 0,
+                "total_words": 0,
+                "total_sentences": 0,
+                "total_punctuation": 0,
+                "total_whitespace": 0,
+                "total_digits": 0,
+                "punctuation_types": [],
+                "vowel_count": 0,
+                "consonant_count": 0,
+                "uppercase_count": 0,
+                "lowercase_count": 0,
+            },
+        )
+
+    # Initialize counters
+    total_chars = len(text)
+    letter_counts = {letter: 0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+    vowel_count = 0
+    consonant_count = 0
+    uppercase_count = 0
+    lowercase_count = 0
+    digit_count = 0
+    whitespace_count = 0
+    punctuation_count = 0
+    punctuation_types = set()
+
+    # Single pass through text to classify and count all characters
+    for char in text:
+        if char.isalpha():
+            if char.lower() in letter_counts:  # a-z frequency; skip non-ASCII letters (avoids KeyError)
+                letter_counts[char.lower()] += 1
+
+            # Count vowels and consonants
+            if char.lower() in VOWELS:
+                vowel_count += 1
+            else:
+                consonant_count += 1
+
+            # Count uppercase and lowercase
+            if char.isupper():
+                uppercase_count += 1
+            else:
+                lowercase_count += 1
+
+        elif char.isdigit():
+            digit_count += 1
+
+        elif char.isspace():
+            whitespace_count += 1
+
+        elif char in PUNCTUATION:
+            punctuation_count += 1
+            punctuation_types.add(char)
+
+    total_letters = vowel_count + consonant_count
+
+    # Calculate letter frequency distribution (normalize to sum to 1.0)
+    if total_letters > 0:
+        letter_frequency = {letter: count / total_letters for letter, count in letter_counts.items()}
+    else:
+        letter_frequency = {letter: 0.0 for letter in "abcdefghijklmnopqrstuvwxyz"}
+
+    # Tokenize into words (split on whitespace, then strip punctuation for length)
+    words = text.split()
+    total_words = len(words)
+
+    # Calculate average word length (count only letters and digits in words)
+    if total_words > 0:
+        word_lengths = []
+        for word in words:
+            # Count only alphanumeric characters for word length
+            word_length = sum(1 for char in word if char.isalnum())
+            if word_length > 0:  # Only count words with at least one alphanumeric char
+                word_lengths.append(word_length)
+
+        if word_lengths:
+            avg_word_length = sum(word_lengths) / len(word_lengths)
+        else:
+            avg_word_length = float("nan")
+    else:
+        avg_word_length = float("nan")
+
+    # Segment text into sentences (split on . ! ?)
+    # Simple approach: split on sentence delimiters
+    sentence_delimiters = {".", "!", "?"}
+    sentences = []
+    current_sentence = []
+
+    for char in text:
+        current_sentence.append(char)
+        if char in sentence_delimiters:
+            # End of sentence
+            sentence_text = "".join(current_sentence).strip()
+            if sentence_text:  # Only add non-empty sentences
+                sentences.append(sentence_text)
+            current_sentence = []
+
+    # Add any remaining text as a sentence if it's non-empty and doesn't end with delimiter
+    if current_sentence:
+        sentence_text = "".join(current_sentence).strip()
+        if sentence_text:
+            sentences.append(sentence_text)
+
+    total_sentences = len(sentences)
+
+    # Calculate average sentence length in characters
+    if total_sentences > 0:
+        sentence_lengths = [len(sent) for sent in sentences]
+        avg_sentence_length_chars = sum(sentence_lengths) / total_sentences
+    else:
+        avg_sentence_length_chars = float("nan")
+
+    # Calculate punctuation density (per 100 words)
+    if total_words > 0:
+        punctuation_density = (punctuation_count / total_words) * 100
+    else:
+        punctuation_density = float("nan")
+
+    # Punctuation variety (count of unique punctuation types)
+    punctuation_variety = len(punctuation_types)
+
+    # Calculate vowel-to-consonant ratio
+    if consonant_count > 0:
+        vowel_consonant_ratio = vowel_count / consonant_count
+    elif vowel_count > 0:
+        # Vowels but no consonants - ratio is infinity
+        vowel_consonant_ratio = float("inf")
+    else:
+        # No letters at all
+        vowel_consonant_ratio = float("nan")
+
+    # Calculate digit ratio
+    if total_chars > 0:
+        digit_ratio = digit_count / total_chars
+    else:
+        digit_ratio = float("nan")
+
+    # Calculate uppercase ratio
+    if total_letters > 0:
+        uppercase_ratio = uppercase_count / total_letters
+    else:
+        uppercase_ratio = float("nan")
+
+    # Calculate whitespace ratio
+    if total_chars > 0:
+        whitespace_ratio = whitespace_count / total_chars
+    else:
+        whitespace_ratio = float("nan")
+
+    # Build metadata
+    metadata = {
+        "total_characters": total_chars,
+        "total_letters": total_letters,
+        "total_words": total_words,
+        "total_sentences": total_sentences,
+        "total_punctuation": punctuation_count,
+        "total_whitespace": whitespace_count,
+        "total_digits": digit_count,
+        "punctuation_types": sorted(punctuation_types),
+        "vowel_count": vowel_count,
+        "consonant_count": consonant_count,
+        "uppercase_count": uppercase_count,
+        "lowercase_count": lowercase_count,
+    }
+
+    return CharacterMetricsResult(
+        avg_word_length=avg_word_length,
+        avg_sentence_length_chars=avg_sentence_length_chars,
+        punctuation_density=punctuation_density,
+        punctuation_variety=punctuation_variety,
+        letter_frequency=letter_frequency,
+        vowel_consonant_ratio=vowel_consonant_ratio,
+        digit_count=digit_count,
+        digit_ratio=digit_ratio,
+        uppercase_ratio=uppercase_ratio,
+        whitespace_ratio=whitespace_ratio,
+        metadata=metadata,
+    )
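Usage, via the export added in pystylometry/character/__init__.py (output comments reflect the logic above):

    from pystylometry.character import compute_character_metrics

    result = compute_character_metrics("Hello, world! Numbers like 42 count too.")
    print(result.avg_word_length)              # mean alphanumeric characters per word
    print(result.punctuation_variety)          # 3 distinct marks here: , ! .
    print(result.metadata["total_sentences"])  # 2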
pystylometry/lexical/__init__.py CHANGED
@@ -1,17 +1,24 @@
 """Lexical diversity metrics."""
 
-# Re-export from stylometry-ttr
-# from stylometry_ttr import compute_ttr, TTRResult
-
 # Local implementations
-from .hapax import compute_hapax_ratios
+from .advanced_diversity import compute_hdd, compute_mattr, compute_msttr, compute_vocd_d
+from .function_words import compute_function_words
+from .hapax import compute_hapax_ratios, compute_hapax_with_lexicon_analysis
 from .mtld import compute_mtld
+from .ttr import compute_ttr
+from .word_frequency_sophistication import compute_word_frequency_sophistication
 from .yule import compute_yule
 
 __all__ = [
-    # "compute_ttr",  # From stylometry-ttr
-    # "TTRResult",  # From stylometry-ttr
+    "compute_ttr",
     "compute_mtld",
    "compute_yule",
     "compute_hapax_ratios",
+    "compute_hapax_with_lexicon_analysis",
+    "compute_function_words",
+    "compute_vocd_d",
+    "compute_mattr",
+    "compute_hdd",
+    "compute_msttr",
+    "compute_word_frequency_sophistication",
 ]
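The lexical surface now spans eleven exported functions. A hedged usage sketch (the new signatures are not shown in this diff; this assumes each compute_* call takes a raw text string with sensible defaults):

    from pystylometry.lexical import compute_mtld, compute_ttr

    sample = "the cat sat on the mat while the dog slept on the rug"
    print(compute_ttr(sample))   # type-token ratio
    print(compute_mtld(sample))  # measure of textual lexical diversity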