pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/lexical/advanced_diversity.py

@@ -32,7 +32,13 @@ References:
 import random
 from typing import Optional

-from .._types import
+from .._types import (
+    HDDResult,
+    MATTRResult,
+    MSTTRResult,
+    VocdDResult,
+    make_distribution,
+)


 def _tokenize_for_diversity(text: str) -> list[str]:
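The headline change in this file is the new `make_distribution` import plus the `*_dist`, `chunk_size`, and `chunk_count` fields threaded through every result constructor in the hunks below. The `_types.py` rewrite itself (+1017 lines) is not shown here, but the function_words hunk further down builds a `Distribution` with `values`, `mean`, `median`, `std`, `range`, and `iqr` fields, so a rough stand-in for the helper could look like this (a sketch under that assumption, not the package's actual implementation):

```python
# Hypothetical stand-in for pystylometry._types.make_distribution; the real
# helper lives in _types.py, which this diff does not show in full.
import statistics
from dataclasses import dataclass


@dataclass
class Distribution:
    values: list[float]
    mean: float
    median: float
    std: float
    range: float
    iqr: float


def make_distribution(values: list[float]) -> Distribution:
    if not values:
        # Mirrors the empty-text branch in compute_function_words below.
        return Distribution([], float("nan"), float("nan"), 0.0, 0.0, 0.0)
    quartiles = statistics.quantiles(values, n=4) if len(values) >= 2 else [values[0]] * 3
    return Distribution(
        values=list(values),
        mean=statistics.fmean(values),
        median=statistics.median(values),
        std=statistics.pstdev(values) if len(values) > 1 else 0.0,
        range=max(values) - min(values),
        iqr=quartiles[2] - quartiles[0],
    )
```

For single-pass metrics the library wraps exactly one value, e.g. `make_distribution([d_param])`, so mean and median equal the score and the spread statistics are zero; `chunk_size` and `chunk_count` presumably let chunked analyses (such as the new consistency/drift modules in this release) report real multi-value distributions.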
@@ -61,13 +67,13 @@ def _tokenize_for_diversity(text: str) -> list[str]:
     raw_tokens = text_lower.split()

     # Comprehensive punctuation set for stripping
-
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„\"\"''‚'")

     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token: # Only add non-empty tokens
             tokens.append(clean_token)

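The tokenizer edit above is behaviour-preserving reflow: the punctuation set is inlined into one `str.strip` call. Lowercasing, whitespace splitting, edge-stripping, and dropping empty tokens all stay the same. A quick standalone illustration of that stripping step (abbreviated character set, not the full one from the diff):

```python
# Standalone illustration of the stripping logic in _tokenize_for_diversity.
punctuation_chars = set(".,!?;:'\"()[]{}")  # abbreviated set for the example

raw_tokens = '"Hello," she said -- twice!'.lower().split()
tokens = [t.strip("".join(punctuation_chars)) for t in raw_tokens]
tokens = [t for t in tokens if t]
print(tokens)  # ['hello', 'she', 'said', '--', 'twice']
```

Only leading and trailing characters are removed, so interior punctuation such as the apostrophe in "don't" survives.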
@@ -80,6 +86,7 @@ def compute_vocd_d(
     num_samples: int = 100,
     min_tokens: int = 100,
     random_seed: Optional[int] = None,
+    chunk_size: int = 1000,
 ) -> VocdDResult:
     """
     Compute voc-D (vocabulary D) using curve-fitting approach.
@@ -167,9 +174,7 @@ def compute_vocd_d(

     # Step 2: Validate minimum length
     if total_tokens < min_tokens:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {min_tokens} required for voc-D")

     # Step 3: Determine sample sizes to test
     # Test from 10 tokens up to min(100, total_tokens - 10)
@@ -212,12 +217,12 @@ def compute_vocd_d(
         numerator += ttr / (size**0.5)
         denominator += 1.0 / size

-
+    d_param = numerator / denominator if denominator > 0 else 0.0

     # Step 6: Calculate R² (goodness of fit)
     # Predicted TTR = D / sqrt(sample_size)
     y_actual = list(sample_size_to_mean_ttr.values())
-    y_predicted = [
+    y_predicted = [d_param / (size**0.5) for size in sample_sizes]

     # R² calculation
     mean_y = sum(y_actual) / len(y_actual)
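The replacement lines make the curve fit explicit. For the model TTR ≈ D/√N used here, minimizing squared error over the sampled sizes gives the closed form D = Σ(TTRᵢ/√Nᵢ) / Σ(1/Nᵢ), which is exactly what the `numerator`/`denominator` accumulation computes, and R² is then taken against the predicted curve D/√N. A self-contained rerun of that arithmetic on made-up TTR samples (illustrative values, no pystylometry imports):

```python
# Closed-form fit of TTR = D / sqrt(N), matching the accumulation in the hunk above.
sample_size_to_mean_ttr = {10: 0.93, 20: 0.88, 35: 0.83, 50: 0.79, 100: 0.72}  # made-up

numerator = sum(ttr / (size ** 0.5) for size, ttr in sample_size_to_mean_ttr.items())
denominator = sum(1.0 / size for size in sample_size_to_mean_ttr)
d_param = numerator / denominator if denominator > 0 else 0.0

# Goodness of fit against the predicted curve D / sqrt(N).
y_actual = list(sample_size_to_mean_ttr.values())
y_predicted = [d_param / (size ** 0.5) for size in sample_size_to_mean_ttr]
mean_y = sum(y_actual) / len(y_actual)
ss_res = sum((a - p) ** 2 for a, p in zip(y_actual, y_predicted))
ss_tot = sum((a - mean_y) ** 2 for a in y_actual)
r_squared = 1.0 - ss_res / ss_tot if ss_tot > 0 else 0.0
print(round(d_param, 2), round(r_squared, 3))
```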
@@ -237,17 +242,25 @@ def compute_vocd_d(
         "random_seed": random_seed,
     }

-    # Step 8:
+    # Step 8: Create distributions (single-pass analysis)
+    d_parameter_dist = make_distribution([d_param])
+    curve_fit_r_squared_dist = make_distribution([r_squared])
+
+    # Step 9: Return result
     return VocdDResult(
-        d_parameter=
+        d_parameter=d_param,
         curve_fit_r_squared=r_squared,
         sample_count=len(sample_sizes),
         optimal_sample_size=sample_size, # Input parameter
+        d_parameter_dist=d_parameter_dist,
+        curve_fit_r_squared_dist=curve_fit_r_squared_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
+def compute_mattr(text: str, window_size: int = 50, chunk_size: int = 1000) -> MATTRResult:
     """
     Compute Moving-Average Type-Token Ratio (MATTR).

@@ -360,7 +373,13 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         "last_window_ttr": window_ttrs[-1],
     }

-    # Step 7:
+    # Step 7: Create distributions (single-pass analysis)
+    mattr_score_dist = make_distribution([mattr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 8: Return result
     return MATTRResult(
         mattr_score=mattr_score,
         window_size=window_size,
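For context on the values being wrapped into distributions here: MATTR (Covington & McFall) slides a fixed-size window across the token stream, takes a type-token ratio per window, and averages them, which is where `window_ttrs`, `mattr_score`, and the min/max/std fields come from. A minimal standalone sketch follows (names are illustrative and the short-text fallback is an assumption; the package's handling of texts shorter than the window is not visible in this diff):

```python
# Minimal moving-average TTR, illustrating the quantities wrapped in MATTRResult.
def mattr(tokens: list[str], window_size: int = 50) -> float:
    if len(tokens) < window_size:
        # Degenerate case: fall back to plain TTR of the whole text (assumption).
        return len(set(tokens)) / len(tokens) if tokens else 0.0
    window_ttrs = [
        len(set(tokens[i : i + window_size])) / window_size
        for i in range(len(tokens) - window_size + 1)
    ]
    return sum(window_ttrs) / len(window_ttrs)


print(mattr("to be or not to be that is the question".split(), window_size=5))
```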
@@ -368,11 +387,17 @@ def compute_mattr(text: str, window_size: int = 50) -> MATTRResult:
         ttr_std_dev=ttr_std_dev,
         min_ttr=min_ttr,
         max_ttr=max_ttr,
+        mattr_score_dist=mattr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
+def compute_hdd(text: str, sample_size: int = 42, chunk_size: int = 1000) -> HDDResult:
     """
     Compute HD-D (Hypergeometric Distribution D).

@@ -451,9 +476,7 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:

     # Step 2: Validate minimum length
     if total_tokens < sample_size:
-        raise ValueError(
-            f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D"
-        )
+        raise ValueError(f"Text has {total_tokens} tokens, minimum {sample_size} required for HD-D")

     # Step 3: Build frequency distribution
     type_counts: dict[str, int] = {}
@@ -485,17 +508,23 @@ def compute_hdd(text: str, sample_size: int = 42) -> HDDResult:
         "calculation_method": "simplified",
     }

-    # Step 6:
+    # Step 6: Create distribution (single-pass analysis)
+    hdd_score_dist = make_distribution([hdd_sum])
+
+    # Step 7: Return result
     return HDDResult(
         hdd_score=hdd_sum,
         sample_size=sample_size,
         type_count=total_types,
         token_count=total_tokens,
+        hdd_score_dist=hdd_score_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )


-def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
+def compute_msttr(text: str, segment_size: int = 100, chunk_size: int = 1000) -> MSTTRResult:
     """
     Compute Mean Segmental Type-Token Ratio (MSTTR).

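HD-D (McCarthy & Jarvis) scores each word type by the probability that it appears at least once in a random draw of `sample_size` tokens and sums those contributions scaled by 1/sample_size. The metadata above labels this implementation "simplified", so the package's exact formula is not visible in this hunk; for comparison, a textbook-style version looks like the sketch below (illustrative only, not the package's code):

```python
# Textbook-style HD-D: sum over types of P(type appears in a random sample) / sample_size.
from math import comb


def hdd(tokens: list[str], sample_size: int = 42) -> float:
    n = len(tokens)
    counts: dict[str, int] = {}
    for t in tokens:
        counts[t] = counts.get(t, 0) + 1
    score = 0.0
    for freq in counts.values():
        # Hypergeometric probability of drawing zero copies of this type.
        p_zero = comb(n - freq, sample_size) / comb(n, sample_size)
        score += (1.0 - p_zero) / sample_size
    return score


print(round(hdd(("the cat sat on the mat and the dog sat too " * 5).split()), 3))
```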
@@ -604,9 +633,7 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:

     # Step 6: Calculate statistics
     # Standard deviation
-    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(
-        segment_ttrs
-    )
+    variance = sum((ttr - msttr_score) ** 2 for ttr in segment_ttrs) / len(segment_ttrs)
     ttr_std_dev = variance**0.5

     # Min and max
@@ -628,7 +655,13 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         "last_segment_ttr": segment_ttrs[-1],
     }

-    # Step 9:
+    # Step 9: Create distributions (single-pass analysis)
+    msttr_score_dist = make_distribution([msttr_score])
+    ttr_std_dev_dist = make_distribution([ttr_std_dev])
+    min_ttr_dist = make_distribution([min_ttr])
+    max_ttr_dist = make_distribution([max_ttr])
+
+    # Step 10: Return result
     return MSTTRResult(
         msttr_score=msttr_score,
         segment_size=segment_size,
@@ -637,5 +670,11 @@ def compute_msttr(text: str, segment_size: int = 100) -> MSTTRResult:
         min_ttr=min_ttr,
         max_ttr=max_ttr,
         segment_ttrs=segment_ttrs,
+        msttr_score_dist=msttr_score_dist,
+        ttr_std_dev_dist=ttr_std_dev_dist,
+        min_ttr_dist=min_ttr_dist,
+        max_ttr_dist=max_ttr_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )
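MSTTR, the last of the four measures touched in this file, splits the token stream into consecutive non-overlapping segments of `segment_size` tokens, computes a TTR per segment, and averages them; `segment_ttrs` and the std/min/max fields (and the variance line reflowed in the earlier hunk) all derive from that list. A compact standalone sketch (dropping the partial tail segment is an assumption, not confirmed by this diff):

```python
# Mean Segmental TTR over non-overlapping segments; partial tail segment discarded.
def msttr(tokens: list[str], segment_size: int = 100) -> float:
    segments = [
        tokens[i : i + segment_size]
        for i in range(0, len(tokens) - segment_size + 1, segment_size)
    ]
    if not segments:
        return 0.0
    segment_ttrs = [len(set(seg)) / len(seg) for seg in segments]
    return sum(segment_ttrs) / len(segment_ttrs)


words = ("some sample words repeated to make a toy corpus " * 30).split()
print(round(msttr(words, segment_size=100), 3))
```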
pystylometry/lexical/function_words.py

@@ -32,8 +32,7 @@ References:
     words for authorship attribution. ACH/ALLC.
 """

-from .._types import FunctionWordResult
-
+from .._types import Distribution, FunctionWordResult, make_distribution

 # Function word lists for English
 # GitHub Issue #13: https://github.com/craigtrim/pystylometry/issues/13
@@ -42,78 +41,249 @@ from .._types import FunctionWordResult

 # Determiners (articles, demonstratives, possessives, quantifiers)
 DETERMINERS = {
-    "the",
-    "
-    "
-    "
-    "
-    "
+    "the",
+    "a",
+    "an", # Articles
+    "this",
+    "that",
+    "these",
+    "those", # Demonstratives
+    "my",
+    "your",
+    "his",
+    "her",
+    "its",
+    "our",
+    "their", # Possessive determiners
+    "some",
+    "any",
+    "no",
+    "every",
+    "each",
+    "either",
+    "neither", # Quantifiers
+    "much",
+    "many",
+    "more",
+    "most",
+    "few",
+    "fewer",
+    "less",
+    "least",
+    "all",
+    "both",
+    "half",
+    "several",
+    "enough",
 }

 # Prepositions (locative, temporal, other)
 PREPOSITIONS = {
-    "in",
-    "
-    "
-    "
-    "
-    "
-    "
-    "
+    "in",
+    "on",
+    "at",
+    "by",
+    "for",
+    "with",
+    "from",
+    "to",
+    "of",
+    "about",
+    "above",
+    "across",
+    "after",
+    "against",
+    "along",
+    "among",
+    "around",
+    "as",
+    "before",
+    "behind",
+    "below",
+    "beneath",
+    "beside",
+    "between",
+    "beyond",
+    "but",
+    "concerning",
+    "considering",
+    "despite",
+    "down",
+    "during",
+    "except",
+    "inside",
+    "into",
+    "like",
+    "near",
+    "off",
+    "onto",
+    "out",
+    "outside",
+    "over",
+    "past",
+    "regarding",
+    "since",
+    "through",
+    "throughout",
+    "till",
+    "toward",
+    "under",
+    "underneath",
+    "until",
+    "up",
+    "upon",
+    "via",
+    "within",
+    "without",
 }

 # Conjunctions (coordinating, subordinating, correlative)
 CONJUNCTIONS = {
     # Coordinating
-    "and",
+    "and",
+    "but",
+    "or",
+    "nor",
+    "for",
+    "yet",
+    "so",
     # Subordinating
-    "although",
-    "
-    "
+    "although",
+    "because",
+    "since",
+    "unless",
+    "while",
+    "if",
+    "when",
+    "where",
+    "after",
+    "before",
+    "once",
+    "until",
+    "as",
+    "though",
+    "even",
+    "whereas",
+    "wherever",
+    "whenever",
     # Correlative components
-    "either",
+    "either",
+    "neither",
+    "both",
+    "whether",
 }

 # Pronouns (personal, possessive, reflexive, demonstrative, relative, indefinite)
 PRONOUNS = {
     # Personal (subject)
-    "i",
+    "i",
+    "you",
+    "he",
+    "she",
+    "it",
+    "we",
+    "they",
     # Personal (object)
-    "me",
+    "me",
+    "him",
+    "her",
+    "us",
+    "them",
     # Possessive
-    "mine",
+    "mine",
+    "yours",
+    "his",
+    "hers",
+    "its",
+    "ours",
+    "theirs",
     # Reflexive
-    "myself",
-    "
+    "myself",
+    "yourself",
+    "himself",
+    "herself",
+    "itself",
+    "ourselves",
+    "yourselves",
+    "themselves",
     # Demonstrative
-    "this",
+    "this",
+    "that",
+    "these",
+    "those",
     # Relative
-    "who",
+    "who",
+    "whom",
+    "whose",
+    "which",
+    "that",
     # Indefinite
-    "anybody",
-    "
-    "
+    "anybody",
+    "anyone",
+    "anything",
+    "everybody",
+    "everyone",
+    "everything",
+    "nobody",
+    "no one",
+    "nothing",
+    "somebody",
+    "someone",
+    "something",
+    "one",
 }

 # Auxiliary verbs (modal, primary)
 AUXILIARIES = {
     # Modals
-    "can",
-    "
+    "can",
+    "could",
+    "may",
+    "might",
+    "must",
+    "shall",
+    "should",
+    "will",
+    "would",
+    "ought",
     # Primary auxiliaries (be, have, do)
-    "am",
-    "
-    "
+    "am",
+    "is",
+    "are",
+    "was",
+    "were",
+    "be",
+    "being",
+    "been",
+    "have",
+    "has",
+    "had",
+    "having",
+    "do",
+    "does",
+    "did",
+    "doing",
 }

 # Particles (often used with phrasal verbs)
 PARTICLES = {
-    "up",
-    "
+    "up",
+    "down",
+    "out",
+    "off",
+    "over",
+    "in",
+    "away",
+    "back",
+    "on",
+    "along",
+    "forth",
+    "apart",
+    "aside",
 }


-def compute_function_words(text: str) -> FunctionWordResult:
+def compute_function_words(text: str, chunk_size: int = 1000) -> FunctionWordResult:
     """
     Compute function word frequency profiles for authorship analysis.

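These expanded word lists feed the per-category ratios computed later in the function (Steps 5 and 6 in the hunks below). As the docstring note carried in this diff says, categories are counted independently, so a token like "her" raises both the determiner and the pronoun ratio. A small sketch of that counting pattern with abbreviated stand-in sets (hypothetical helper, not the package's API):

```python
# Illustrative per-category counting over a token list; the sets here are
# abbreviated stand-ins for the DETERMINERS, PRONOUNS, ... constants above.
CATEGORIES = {
    "determiner": {"the", "a", "an", "this", "that", "her"},  # abbreviated
    "pronoun": {"i", "you", "she", "her", "it", "they"},      # abbreviated
}


def category_ratios(tokens: list[str]) -> dict[str, float]:
    total = len(tokens)
    return {
        name: (sum(1 for t in tokens if t in words) / total) if total else 0.0
        for name, words in CATEGORIES.items()
    }


# "her" counts toward both categories, matching the overlap behaviour noted in the docstring.
print(category_ratios("she gave her a book about her cat".split()))
```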
@@ -180,18 +350,21 @@ def compute_function_words(text: str) -> FunctionWordResult:
         determiner and pronoun) - each category is counted independently
     """
     # Step 1: Create union set of all function words (for total ratio calculation)
-
-        DETERMINERS
-        | PREPOSITIONS
-        | CONJUNCTIONS
-        | PRONOUNS
-        | AUXILIARIES
-        | PARTICLES
+    all_function_words = (
+        DETERMINERS | PREPOSITIONS | CONJUNCTIONS | PRONOUNS | AUXILIARIES | PARTICLES
     )

     # Step 2: Tokenize text (lowercase, split on whitespace, strip punctuation)
     if not text or not text.strip():
         # Handle empty text edge case
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return FunctionWordResult(
             determiner_ratio=0.0,
             preposition_ratio=0.0,
@@ -204,6 +377,16 @@ def compute_function_words(text: str) -> FunctionWordResult:
             most_frequent_function_words=[],
             least_frequent_function_words=[],
             function_word_distribution={},
+            determiner_ratio_dist=empty_dist,
+            preposition_ratio_dist=empty_dist,
+            conjunction_ratio_dist=empty_dist,
+            pronoun_ratio_dist=empty_dist,
+            auxiliary_ratio_dist=empty_dist,
+            particle_ratio_dist=empty_dist,
+            total_function_word_ratio_dist=empty_dist,
+            function_word_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "total_word_count": 0,
                 "total_function_word_count": 0,
@@ -232,15 +415,13 @@ def compute_function_words(text: str) -> FunctionWordResult:
     raw_tokens = text_lower.split()

     # Comprehensive punctuation set for stripping
-
-        ".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„""''‚'"
-    )
+    punctuation_chars = set(".,!?;:'\"()[]{}/-—–…*&@#$%^~`\\|<>«»„''‚'")

     # Strip punctuation from each token
     tokens = []
     for token in raw_tokens:
         # Strip leading and trailing punctuation
-        clean_token = token.strip("".join(
+        clean_token = token.strip("".join(punctuation_chars))
         if clean_token: # Only add non-empty tokens
             tokens.append(clean_token)

@@ -272,7 +453,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 5: Build distribution (count each function word only once per token)
     function_word_counts: dict[str, int] = {}
     for token in tokens:
-        if token in
+        if token in all_function_words:
             function_word_counts[token] = function_word_counts.get(token, 0) + 1

     # Step 6: Calculate ratios
@@ -306,9 +487,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
     # Step 8: Find most/least frequent function words
     if function_word_counts:
         # Sort by count descending
-        sorted_by_count = sorted(
-            function_word_counts.items(), key=lambda x: x[1], reverse=True
-        )
+        sorted_by_count = sorted(function_word_counts.items(), key=lambda x: x[1], reverse=True)

         # Top 10 most frequent
         most_frequent = sorted_by_count[:10]
@@ -353,7 +532,17 @@ def compute_function_words(text: str) -> FunctionWordResult:

     overlapping_words.sort()

-    # Step 11:
+    # Step 11: Create single-value distributions (analysis is done on full text)
+    determiner_ratio_dist = make_distribution([determiner_ratio])
+    preposition_ratio_dist = make_distribution([preposition_ratio])
+    conjunction_ratio_dist = make_distribution([conjunction_ratio])
+    pronoun_ratio_dist = make_distribution([pronoun_ratio])
+    auxiliary_ratio_dist = make_distribution([auxiliary_ratio])
+    particle_ratio_dist = make_distribution([particle_ratio])
+    total_function_word_ratio_dist = make_distribution([total_function_word_ratio])
+    function_word_diversity_dist = make_distribution([function_word_diversity])
+
+    # Step 12: Build metadata
     metadata = {
         "total_word_count": total_words,
         "total_function_word_count": total_function_word_count,
@@ -374,7 +563,7 @@ def compute_function_words(text: str) -> FunctionWordResult:
         "overlapping_word_categories": overlapping_word_categories,
     }

-    # Step
+    # Step 13: Return result
     return FunctionWordResult(
         determiner_ratio=determiner_ratio,
         preposition_ratio=preposition_ratio,
@@ -387,5 +576,15 @@ def compute_function_words(text: str) -> FunctionWordResult:
         most_frequent_function_words=most_frequent,
         least_frequent_function_words=least_frequent,
         function_word_distribution=function_word_counts,
+        determiner_ratio_dist=determiner_ratio_dist,
+        preposition_ratio_dist=preposition_ratio_dist,
+        conjunction_ratio_dist=conjunction_ratio_dist,
+        pronoun_ratio_dist=pronoun_ratio_dist,
+        auxiliary_ratio_dist=auxiliary_ratio_dist,
+        particle_ratio_dist=particle_ratio_dist,
+        total_function_word_ratio_dist=total_function_word_ratio_dist,
+        function_word_diversity_dist=function_word_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1, # Single pass analysis
         metadata=metadata,
     )