pystylometry 1.0.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +45 -3
- pystylometry/_types.py +1017 -259
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +28 -4
- pystylometry/authorship/additional_methods.py +260 -40
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +354 -0
- pystylometry/character/README.md +17 -0
- pystylometry/character/character_metrics.py +267 -179
- pystylometry/cli.py +427 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/consistency/__init__.py +57 -0
- pystylometry/consistency/_thresholds.py +162 -0
- pystylometry/consistency/drift.py +549 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/dialect/__init__.py +65 -0
- pystylometry/dialect/_data/dialect_markers.json +1134 -0
- pystylometry/dialect/_loader.py +360 -0
- pystylometry/dialect/detector.py +533 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/lexical/advanced_diversity.py +61 -22
- pystylometry/lexical/function_words.py +255 -56
- pystylometry/lexical/hapax.py +182 -52
- pystylometry/lexical/mtld.py +108 -26
- pystylometry/lexical/ttr.py +76 -10
- pystylometry/lexical/word_frequency_sophistication.py +1522 -298
- pystylometry/lexical/yule.py +136 -50
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/entropy.py +150 -49
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/readability/additional_formulas.py +1887 -762
- pystylometry/readability/ari.py +144 -82
- pystylometry/readability/coleman_liau.py +136 -109
- pystylometry/readability/flesch.py +177 -73
- pystylometry/readability/gunning_fog.py +165 -161
- pystylometry/readability/smog.py +123 -42
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/syntactic/advanced_syntactic.py +76 -14
- pystylometry/syntactic/pos_ratios.py +70 -6
- pystylometry/syntactic/sentence_stats.py +55 -12
- pystylometry/syntactic/sentence_types.py +71 -15
- pystylometry/viz/README.md +27 -0
- pystylometry/viz/__init__.py +71 -0
- pystylometry/viz/drift.py +589 -0
- pystylometry/viz/jsx/__init__.py +31 -0
- pystylometry/viz/jsx/_base.py +144 -0
- pystylometry/viz/jsx/report.py +677 -0
- pystylometry/viz/jsx/timeline.py +716 -0
- pystylometry/viz/jsx/viewer.py +1032 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- pystylometry-1.3.0.dist-info/RECORD +76 -0
- {pystylometry-1.0.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +1 -1
- pystylometry-1.3.0.dist-info/entry_points.txt +4 -0
- pystylometry-1.0.0.dist-info/METADATA +0 -275
- pystylometry-1.0.0.dist-info/RECORD +0 -46
pystylometry/syntactic/pos_ratios.py

@@ -1,10 +1,17 @@
-"""Part-of-Speech ratio analysis using spaCy."""
+"""Part-of-Speech ratio analysis using spaCy.
 
-from .._types import POSResult
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
+
+from .._types import Distribution, POSResult, make_distribution
 from .._utils import check_optional_dependency
 
 
-def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
+def compute_pos_ratios(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> POSResult:
     """
     Compute Part-of-Speech ratios and lexical density using spaCy.
 
@@ -18,6 +25,10 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     - Lexical density: (nouns + verbs + adjectives + adverbs) / total words
     - Function word ratio: (determiners + prepositions + conjunctions) / total words
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Biber, D. (1988). Variation across speech and writing.
         Cambridge University Press.
@@ -25,9 +36,13 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: POS analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        POSResult with all POS ratios and metadata
+        POSResult with all POS ratios, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -47,8 +62,7 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
         )
 
     # Process text with spaCy
@@ -89,6 +103,14 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
 
     # Handle empty text
     if total_tokens == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return POSResult(
             noun_ratio=float("nan"),
             verb_ratio=float("nan"),
@@ -98,6 +120,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
             adjective_noun_ratio=float("nan"),
             lexical_density=float("nan"),
             function_word_ratio=float("nan"),
+            noun_ratio_dist=empty_dist,
+            verb_ratio_dist=empty_dist,
+            adjective_ratio_dist=empty_dist,
+            adverb_ratio_dist=empty_dist,
+            noun_verb_ratio_dist=empty_dist,
+            adjective_noun_ratio_dist=empty_dist,
+            lexical_density_dist=empty_dist,
+            function_word_ratio_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
                 "token_count": 0,
@@ -129,6 +161,28 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
     function_words = det_count + adp_count + conj_count
     function_word_ratio = function_words / total_tokens
 
+    # Create single-value distributions (POS analysis is done on full text)
+    noun_ratio_dist = make_distribution([noun_ratio])
+    verb_ratio_dist = make_distribution([verb_ratio])
+    adj_ratio_dist = make_distribution([adj_ratio])
+    adv_ratio_dist = make_distribution([adv_ratio])
+    noun_verb_dist = (
+        make_distribution([noun_verb_ratio])
+        if not (noun_verb_ratio != noun_verb_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    adj_noun_dist = (
+        make_distribution([adj_noun_ratio])
+        if not (adj_noun_ratio != adj_noun_ratio)
+        else Distribution(
+            values=[], mean=float("nan"), median=float("nan"), std=0.0, range=0.0, iqr=0.0
+        )
+    )
+    lexical_density_dist = make_distribution([lexical_density])
+    function_word_dist = make_distribution([function_word_ratio])
+
     return POSResult(
         noun_ratio=noun_ratio,
         verb_ratio=verb_ratio,
@@ -138,6 +192,16 @@ def compute_pos_ratios(text: str, model: str = "en_core_web_sm") -> POSResult:
         adjective_noun_ratio=adj_noun_ratio,
         lexical_density=lexical_density,
         function_word_ratio=function_word_ratio,
+        noun_ratio_dist=noun_ratio_dist,
+        verb_ratio_dist=verb_ratio_dist,
+        adjective_ratio_dist=adj_ratio_dist,
+        adverb_ratio_dist=adv_ratio_dist,
+        noun_verb_ratio_dist=noun_verb_dist,
+        adjective_noun_ratio_dist=adj_noun_dist,
+        lexical_density_dist=lexical_density_dist,
+        function_word_ratio_dist=function_word_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "token_count": total_tokens,
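The `not (x != x)` guards in the `@@ -129,6 +161,28 @@` hunk above are NaN tests: NaN is the only float that compares unequal to itself, so the `make_distribution` call is skipped exactly when the ratio is undefined (e.g. a zero denominator). A minimal sketch of the equivalence — the variable names here are illustrative, and `math.isnan` is the more readable spelling:

```python
import math

nan = float("nan")
ratio = 0.75  # an ordinary, well-defined ratio

# NaN is the only float for which x != x is True, so the expression
# `not (x != x)` reads as "x is a real (non-NaN) number".
assert (nan != nan) is True
assert math.isnan(nan)

assert (ratio != ratio) is False
assert not math.isnan(ratio)
```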
pystylometry/syntactic/sentence_stats.py

@@ -1,10 +1,17 @@
-"""Sentence-level statistics using spaCy."""
+"""Sentence-level statistics using spaCy.
 
-from .._types import SentenceStatsResult
-from .._utils import check_optional_dependency
+Related GitHub Issue:
+    #27 - Native chunked analysis with Distribution dataclass
+    https://github.com/craigtrim/pystylometry/issues/27
+"""
 
+from .._types import Distribution, SentenceStatsResult, make_distribution
+from .._utils import check_optional_dependency
 
-def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> SentenceStatsResult:
+
+def compute_sentence_stats(
+    text: str, model: str = "en_core_web_sm", chunk_size: int = 1000
+) -> SentenceStatsResult:
     """
     Compute sentence-level statistics using spaCy.
 
@@ -16,6 +23,10 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     - Maximum sentence length
     - Total sentence count
 
+    Related GitHub Issue:
+        #27 - Native chunked analysis with Distribution dataclass
+        https://github.com/craigtrim/pystylometry/issues/27
+
     References:
         Hunt, K. W. (1965). Grammatical structures written at three grade levels.
         NCTE Research Report No. 3.
@@ -23,9 +34,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     Args:
         text: Input text to analyze
         model: spaCy model name (default: "en_core_web_sm")
+        chunk_size: Number of words per chunk (default: 1000).
+            Note: Sentence analysis is performed on the full text for accuracy,
+            so this parameter is included for API consistency but actual
+            results are from a single pass.
 
     Returns:
-        SentenceStatsResult with sentence statistics and metadata
+        SentenceStatsResult with sentence statistics, distributions, and metadata
 
     Raises:
         ImportError: If spaCy is not installed
@@ -45,8 +60,7 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
         nlp = spacy.load(model)
     except OSError:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download it with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download it with: python -m spacy download {model}"
        )
 
     # Process text with spaCy
@@ -62,13 +76,28 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
 
     # Handle empty text
     if len(sentence_lengths) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceStatsResult(
             mean_sentence_length=float("nan"),
             sentence_length_std=float("nan"),
-            sentence_length_range=0,
-            min_sentence_length=0,
-            max_sentence_length=0,
+            sentence_length_range=0.0,
+            min_sentence_length=0.0,
+            max_sentence_length=0.0,
             sentence_count=0,
+            mean_sentence_length_dist=empty_dist,
+            sentence_length_std_dist=empty_dist,
+            sentence_length_range_dist=empty_dist,
+            min_sentence_length_dist=empty_dist,
+            max_sentence_length_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "model": model,
             },
@@ -86,10 +115,17 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
     else:
         std_dev = 0.0
 
-    min_length = min(sentence_lengths)
-    max_length = max(sentence_lengths)
+    min_length = float(min(sentence_lengths))
+    max_length = float(max(sentence_lengths))
     length_range = max_length - min_length
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    mean_dist = make_distribution([mean_length])
+    std_dist = make_distribution([std_dev])
+    range_dist = make_distribution([length_range])
+    min_dist = make_distribution([min_length])
+    max_dist = make_distribution([max_length])
+
     return SentenceStatsResult(
         mean_sentence_length=mean_length,
         sentence_length_std=std_dev,
@@ -97,6 +133,13 @@ def compute_sentence_stats(text: str, model: str = "en_core_web_sm") -> Sentence
         min_sentence_length=min_length,
         max_sentence_length=max_length,
         sentence_count=len(sentence_lengths),
+        mean_sentence_length_dist=mean_dist,
+        sentence_length_std_dist=std_dist,
+        sentence_length_range_dist=range_dist,
+        min_sentence_length_dist=min_dist,
+        max_sentence_length_dist=max_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata={
             "model": model,
             "sentence_lengths": sentence_lengths,
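The net effect of these hunks is that every scalar statistic gains a `*_dist` companion plus `chunk_size`/`chunk_count` bookkeeping. A usage sketch based on the signature and field names visible in this diff — the import path and the single-value behavior of `make_distribution` (a one-element sample, so mean == median == the value with zero spread) are assumptions, since `_types.py` is not shown here:

```python
from pystylometry.syntactic import compute_sentence_stats  # import path assumed

result = compute_sentence_stats("One short sentence. Then a noticeably longer second sentence.")

# Scalar fields behave as in 1.0.0...
print(result.mean_sentence_length, result.sentence_count)

# ...and each now has a Distribution companion. Because this analysis
# runs in a single pass over the full text, the distribution wraps a
# single value and chunk_count is 1.
dist = result.mean_sentence_length_dist
print(dist.values, dist.mean, dist.median, dist.std, dist.iqr)
print(result.chunk_size, result.chunk_count)  # 1000, 1
```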
pystylometry/syntactic/sentence_types.py

@@ -27,13 +27,19 @@ References:
     Quirk, R., et al. (1985). A Comprehensive Grammar of the English Language. Longman.
 """
 
-from .._types import SentenceTypeResult
+from typing import Any
+
+from .._types import Distribution, SentenceTypeResult, make_distribution
 from .._utils import check_optional_dependency
 
+# Type alias for spaCy Span (loaded dynamically)
+_SpaCySpan = Any
+
 
 def compute_sentence_types(
     text: str,
     model: str = "en_core_web_sm",
+    chunk_size: int = 1000,
 ) -> SentenceTypeResult:
     """
     Classify sentences by structure and function.
@@ -193,8 +199,7 @@ def compute_sentence_types(
         nlp = spacy.load(model)
     except OSError as e:
         raise OSError(
-            f"spaCy model '{model}' not found. "
-            f"Download with: python -m spacy download {model}"
+            f"spaCy model '{model}' not found. Download with: python -m spacy download {model}"
         ) from e
 
     # Parse text
@@ -203,6 +208,14 @@ def compute_sentence_types(
 
     # Handle empty text
     if len(sentences) == 0:
+        empty_dist = Distribution(
+            values=[],
+            mean=float("nan"),
+            median=float("nan"),
+            std=0.0,
+            range=0.0,
+            iqr=0.0,
+        )
         return SentenceTypeResult(
             simple_ratio=float("nan"),
             compound_ratio=float("nan"),
@@ -223,6 +236,18 @@ def compute_sentence_types(
             total_sentences=0,
             structural_diversity=float("nan"),
             functional_diversity=float("nan"),
+            simple_ratio_dist=empty_dist,
+            compound_ratio_dist=empty_dist,
+            complex_ratio_dist=empty_dist,
+            compound_complex_ratio_dist=empty_dist,
+            declarative_ratio_dist=empty_dist,
+            interrogative_ratio_dist=empty_dist,
+            imperative_ratio_dist=empty_dist,
+            exclamatory_ratio_dist=empty_dist,
+            structural_diversity_dist=empty_dist,
+            functional_diversity_dist=empty_dist,
+            chunk_size=chunk_size,
+            chunk_count=0,
             metadata={
                 "warning": "Empty text or no sentences found",
             },
@@ -249,13 +274,15 @@ def compute_sentence_types(
         functional_counts[functional_type] += 1
 
         # Store classification
-        sentence_classifications.append(
-            {
-                "text": sent.text,
-                "structural_type": structural_type,
-                "functional_type": functional_type,
-            }
-        )
+        sentence_classifications.append(
+            {
+                "text": sent.text,
+                "structural_type": structural_type,
+                "functional_type": functional_type,
+                "independent_clauses": independent_count,
+                "dependent_clauses": dependent_count,
+            }
+        )
 
     # Calculate ratios
     total_sentences = len(sentences)
@@ -271,11 +298,28 @@ def compute_sentence_types(
 
     # Calculate diversity metrics
     structural_ratios = [simple_ratio, compound_ratio, complex_ratio, compound_complex_ratio]
-    functional_ratios = [declarative_ratio, interrogative_ratio, imperative_ratio, exclamatory_ratio]
+    functional_ratios = [
+        declarative_ratio,
+        interrogative_ratio,
+        imperative_ratio,
+        exclamatory_ratio,
+    ]
 
     structural_diversity = _calculate_shannon_entropy(structural_ratios)
     functional_diversity = _calculate_shannon_entropy(functional_ratios)
 
+    # Create single-value distributions (sentence analysis is done on full text)
+    simple_ratio_dist = make_distribution([simple_ratio])
+    compound_ratio_dist = make_distribution([compound_ratio])
+    complex_ratio_dist = make_distribution([complex_ratio])
+    compound_complex_ratio_dist = make_distribution([compound_complex_ratio])
+    declarative_ratio_dist = make_distribution([declarative_ratio])
+    interrogative_ratio_dist = make_distribution([interrogative_ratio])
+    imperative_ratio_dist = make_distribution([imperative_ratio])
+    exclamatory_ratio_dist = make_distribution([exclamatory_ratio])
+    structural_diversity_dist = make_distribution([structural_diversity])
+    functional_diversity_dist = make_distribution([functional_diversity])
+
     # Collect metadata
     metadata = {
         "sentence_count": total_sentences,
@@ -306,11 +350,23 @@ def compute_sentence_types(
         total_sentences=total_sentences,
         structural_diversity=structural_diversity,
         functional_diversity=functional_diversity,
+        simple_ratio_dist=simple_ratio_dist,
+        compound_ratio_dist=compound_ratio_dist,
+        complex_ratio_dist=complex_ratio_dist,
+        compound_complex_ratio_dist=compound_complex_ratio_dist,
+        declarative_ratio_dist=declarative_ratio_dist,
+        interrogative_ratio_dist=interrogative_ratio_dist,
+        imperative_ratio_dist=imperative_ratio_dist,
+        exclamatory_ratio_dist=exclamatory_ratio_dist,
+        structural_diversity_dist=structural_diversity_dist,
+        functional_diversity_dist=functional_diversity_dist,
+        chunk_size=chunk_size,
+        chunk_count=1,  # Single pass analysis
         metadata=metadata,
     )
 
 
-def _count_independent_clauses(sent) -> int:
+def _count_independent_clauses(sent: _SpaCySpan) -> int:
     """
     Count independent clauses in a sentence.
 
@@ -336,7 +392,7 @@ def _count_independent_clauses(sent) -> int:
     return count
 
 
-def _count_dependent_clauses(sent) -> int:
+def _count_dependent_clauses(sent: _SpaCySpan) -> int:
     """
     Count dependent clauses in a sentence.
 
@@ -382,7 +438,7 @@ def _classify_structural(independent: int, dependent: int) -> str:
     return "simple"
 
 
-def _classify_functional(sent) -> str:
+def _classify_functional(sent: _SpaCySpan) -> str:
     """
     Classify sentence function based on punctuation and structure.
 
@@ -415,7 +471,7 @@ def _classify_functional(sent) -> str:
     return "declarative"
 
 
-def _is_imperative_structure(sent) -> bool:
+def _is_imperative_structure(sent: _SpaCySpan) -> bool:
     """
     Check if sentence has imperative structure.
 
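Both diversity metrics funnel through `_calculate_shannon_entropy`, whose body is not part of this diff. For orientation, a standard Shannon entropy over a list of type proportions looks like the sketch below — a plausible reading of that helper under the usual base-2 convention, not the package's verbatim code:

```python
import math


def shannon_entropy(proportions: list[float]) -> float:
    """H = -sum(p * log2(p)) over the nonzero proportions.

    0.0 when one sentence type dominates completely; log2(k) bits when
    all k types are equally frequent.
    """
    return -sum(p * math.log2(p) for p in proportions if p > 0)


# Four structural types, perfectly balanced -> 2.0 bits.
print(shannon_entropy([0.25, 0.25, 0.25, 0.25]))
# Only simple sentences -> 0.0 bits.
print(shannon_entropy([1.0, 0.0, 0.0, 0.0]))
```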
pystylometry/viz/README.md (new file)

@@ -0,0 +1,27 @@
+# viz
+
+
+
+
+Visualization for drift detection results. Two output modes: static PNG (matplotlib) and interactive HTML (React JSX).
+
+## Catalogue
+
+| File | Functions | Output |
+|------|-----------|--------|
+| `drift.py` | `plot_drift_timeline`, `plot_drift_scatter`, `plot_drift_report` | PNG via matplotlib/seaborn |
+| `jsx/report.py` | `export_drift_report_jsx` | Interactive HTML dashboard |
+| `jsx/timeline.py` | `export_drift_timeline_jsx` | Interactive HTML timeline |
+| `jsx/viewer.py` | `export_drift_viewer` | Standalone HTML viewer with file upload |
+| `jsx/_base.py` | _(internal)_ | React/JSX rendering base |
+
+## Install
+
+```
+pip install pystylometry[viz]  # For PNG output (matplotlib + seaborn)
+# JSX/HTML output requires no additional dependencies
+```
+
+## See Also
+
+- [`consistency/`](../consistency/) produces the `KilgarriffDriftResult` consumed by all viz functions
pystylometry/viz/__init__.py (new file)

@@ -0,0 +1,71 @@
+"""Visualization module for pystylometry.
+
+This module provides visualization functions for stylometric analysis results.
+
+Matplotlib Functions (PNG output):
+    Requires optional dependencies: pip install pystylometry[viz]
+
+    plot_drift_timeline: Line chart of chi-squared values over document
+    plot_drift_scatter: Scatter plot with reference zones (tic-tac-toe style)
+    plot_drift_report: Combined multi-panel visualization
+
+Interactive JSX Functions (HTML output):
+    No additional dependencies required (uses React via CDN)
+
+    export_drift_timeline_jsx: Interactive timeline chart
+    export_drift_report_jsx: Interactive multi-panel dashboard
+    export_drift_viewer: Standalone viewer with file upload
+
+Related GitHub Issues:
+    #38 - Visualization Options for Style Drift Detection
+    https://github.com/craigtrim/pystylometry/issues/38
+
+Example:
+    >>> from pystylometry.consistency import compute_kilgarriff_drift
+    >>> from pystylometry.viz import plot_drift_timeline, export_drift_timeline_jsx
+    >>>
+    >>> result = compute_kilgarriff_drift(text)
+    >>> plot_drift_timeline(result, output="timeline.png")  # Static PNG
+    >>> export_drift_timeline_jsx(result, "timeline.html")  # Interactive HTML
+"""
+
+from .drift import (  # noqa: E402
+    plot_drift_report,
+    plot_drift_scatter,
+    plot_drift_timeline,
+)
+from .jsx import (  # noqa: E402
+    export_drift_report_jsx,
+    export_drift_timeline_jsx,
+    export_drift_viewer,
+)
+
+try:
+    import matplotlib  # noqa: F401
+    import seaborn  # noqa: F401  # type: ignore[import-untyped]
+
+    _VIZ_AVAILABLE = True
+except ImportError:
+    _VIZ_AVAILABLE = False
+
+
+def _check_viz_available() -> None:
+    """Raise ImportError if visualization dependencies are not installed."""
+    if not _VIZ_AVAILABLE:
+        raise ImportError(
+            "Visualization requires optional dependencies. "
+            "Install with: pip install pystylometry[viz] or poetry install --with viz"
+        )
+
+
+__all__ = [
+    # Matplotlib (PNG)
+    "plot_drift_timeline",
+    "plot_drift_scatter",
+    "plot_drift_report",
+    # JSX (HTML)
+    "export_drift_timeline_jsx",
+    "export_drift_report_jsx",
+    # Standalone viewer
+    "export_drift_viewer",
+]
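The try/except import gate above lets the matplotlib path degrade gracefully while the JSX exporters stay dependency-free. A hedged usage sketch — it assumes, per `_check_viz_available`, that the PNG functions raise ImportError when the `viz` extras are absent, and the input file name is illustrative:

```python
from pystylometry.consistency import compute_kilgarriff_drift
from pystylometry.viz import export_drift_timeline_jsx, plot_drift_timeline

text = open("manuscript.txt").read()  # illustrative input
result = compute_kilgarriff_drift(text)

try:
    # Static PNG path: requires `pip install pystylometry[viz]`.
    plot_drift_timeline(result, output="timeline.png")
except ImportError:
    # HTML path needs no extra dependencies (React via CDN).
    export_drift_timeline_jsx(result, "timeline.html")
```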