pystylometry 1.1.0-py3-none-any.whl → 1.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pystylometry/README.md +42 -0
- pystylometry/__init__.py +17 -1
- pystylometry/_types.py +54 -0
- pystylometry/authorship/README.md +21 -0
- pystylometry/authorship/__init__.py +9 -6
- pystylometry/authorship/additional_methods.py +262 -17
- pystylometry/authorship/compression.py +175 -0
- pystylometry/authorship/kilgarriff.py +8 -1
- pystylometry/character/README.md +17 -0
- pystylometry/consistency/README.md +27 -0
- pystylometry/dialect/README.md +26 -0
- pystylometry/lexical/README.md +23 -0
- pystylometry/ngrams/README.md +18 -0
- pystylometry/ngrams/extended_ngrams.py +314 -69
- pystylometry/prosody/README.md +17 -0
- pystylometry/prosody/rhythm_prosody.py +773 -11
- pystylometry/readability/README.md +23 -0
- pystylometry/stylistic/README.md +20 -0
- pystylometry/stylistic/cohesion_coherence.py +669 -13
- pystylometry/stylistic/genre_register.py +1560 -17
- pystylometry/stylistic/markers.py +611 -17
- pystylometry/stylistic/vocabulary_overlap.py +354 -13
- pystylometry/syntactic/README.md +20 -0
- pystylometry/viz/README.md +27 -0
- pystylometry-1.3.0.dist-info/METADATA +136 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/RECORD +28 -15
- pystylometry-1.1.0.dist-info/METADATA +0 -278
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/WHEEL +0 -0
- {pystylometry-1.1.0.dist-info → pystylometry-1.3.0.dist-info}/entry_points.txt +0 -0
pystylometry/README.md
ADDED
@@ -0,0 +1,42 @@
+# pystylometry
+
+
+
+
+Core package for stylometric analysis and authorship attribution.
+
+## Module Map
+
+| Module | Purpose | Key Functions |
+|--------|---------|---------------|
+| [`lexical/`](lexical/) | Vocabulary diversity & richness | `compute_mtld`, `compute_yule`, `compute_ttr`, `compute_hapax_ratios` |
+| [`readability/`](readability/) | Text readability scoring | `compute_flesch`, `compute_gunning_fog`, `compute_ari`, `compute_smog` |
+| [`syntactic/`](syntactic/) | Sentence & parse structure | `compute_pos_ratios`, `compute_sentence_types`, `compute_advanced_syntactic` |
+| [`authorship/`](authorship/) | Author attribution & comparison | `compute_burrows_delta`, `compute_kilgarriff`, `compute_compression_distance` |
+| [`stylistic/`](stylistic/) | Style markers & vocabulary overlap | `compute_stylistic_markers`, `compute_vocabulary_overlap`, `compute_genre_register` |
+| [`character/`](character/) | Character-level features | `compute_character_metrics` |
+| [`ngrams/`](ngrams/) | N-gram entropy & sequences | `compute_extended_ngrams`, `compute_ngram_entropy` |
+| [`dialect/`](dialect/) | Regional dialect detection | `compute_dialect` |
+| [`consistency/`](consistency/) | Intra-document drift detection | `compute_kilgarriff_drift` |
+| [`prosody/`](prosody/) | Rhythm & stress patterns | `compute_rhythm_prosody` |
+| [`viz/`](viz/) | Visualization (PNG & interactive HTML) | `plot_drift_timeline`, `export_drift_report_jsx` |
+
+## Shared Internals
+
+| File | Purpose |
+|------|---------|
+| `_types.py` | All dataclass result types (e.g. `FleschResult`, `MTLDResult`, `KilgarriffDriftResult`) |
+| `_normalize.py` | Text normalization for readability and stylometry pipelines |
+| `_utils.py` | Shared tokenization and helper functions |
+| `tokenizer.py` | Configurable tokenizer with sentence/word splitting |
+| `cli.py` | Command-line interface (`pystylometry analyze`) |
+
+## Installation Extras
+
+```
+pip install pystylometry                 # Core (lexical only)
+pip install pystylometry[readability]    # + readability
+pip install pystylometry[syntactic]      # + syntactic (requires spaCy)
+pip install pystylometry[authorship]     # + authorship attribution
+pip install pystylometry[all]            # Everything
+```
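The extras map onto optional submodules, and the `get_available_modules()` helper changed in `pystylometry/__init__.py` below reports which ones actually imported in a given environment. A minimal sketch of checking that at runtime (key names are taken from this diff; the exact values depend on which extras are installed):

```python
import pystylometry

# Maps module name -> availability, e.g. {"authorship": True, "prosody": False, ...}
available = pystylometry.get_available_modules()

for name, ok in sorted(available.items()):
    print(f"{name:<12} {'available' if ok else 'missing (install the matching extra)'}")
```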
pystylometry/__init__.py
CHANGED
@@ -63,18 +63,28 @@ try:
 except ImportError:
     _SYNTACTIC_AVAILABLE = False
 
-#
+# Prosody requires pronouncing (CMU dictionary) - same dependency as readability
+try:
+    from . import prosody  # noqa: F401 - Rhythm and prosody metrics (Issue #25)
+
+    _PROSODY_AVAILABLE = True
+except ImportError:
+    _PROSODY_AVAILABLE = False
+
+# Authorship, ngrams, dialect, consistency, and stylistic use only stdlib (no external dependencies)
 from . import (
     authorship,  # noqa: F401
     consistency,  # noqa: F401 - Style drift detection (Issue #36)
     dialect,  # noqa: F401
     ngrams,  # noqa: F401
+    stylistic,  # noqa: F401 - Vocabulary overlap and similarity (Issue #21)
 )
 
 _AUTHORSHIP_AVAILABLE = True
 _NGRAMS_AVAILABLE = True
 _DIALECT_AVAILABLE = True
 _CONSISTENCY_AVAILABLE = True
+_STYLISTIC_AVAILABLE = True
 
 
 def analyze(
@@ -206,6 +216,8 @@ def get_available_modules() -> dict[str, bool]:
         "ngrams": _NGRAMS_AVAILABLE,
         "dialect": _DIALECT_AVAILABLE,
         "consistency": _CONSISTENCY_AVAILABLE,  # Style drift detection (Issue #36)
+        "stylistic": _STYLISTIC_AVAILABLE,  # Vocabulary overlap (Issue #21)
+        "prosody": _PROSODY_AVAILABLE,  # Rhythm and prosody (Issue #25)
     }
 
 
@@ -229,3 +241,7 @@ if _DIALECT_AVAILABLE:
     __all__.append("dialect")
 if _CONSISTENCY_AVAILABLE:
     __all__.append("consistency")
+if _STYLISTIC_AVAILABLE:
+    __all__.append("stylistic")
+if _PROSODY_AVAILABLE:
+    __all__.append("prosody")
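Because the new modules are guarded by the same try/except pattern, callers can branch on the availability flags instead of catching ImportError themselves. A minimal sketch; `compute_rhythm_prosody(text)` taking a single string is an assumption here, only the function name appears in this diff:

```python
import pystylometry

text = "The quick brown fox jumps over the lazy dog."

if pystylometry.get_available_modules().get("prosody"):
    # Hypothetical call: the exact signature of compute_rhythm_prosody is not shown in this diff.
    result = pystylometry.prosody.compute_rhythm_prosody(text)
    print(result)
else:
    print("Prosody extra not installed; try: pip install pystylometry[all]")
```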
pystylometry/_types.py
CHANGED
@@ -1517,6 +1517,7 @@ class VocabularyOverlapResult:
         - Dice coefficient (2 * intersection / sum of sizes)
         - Overlap coefficient (intersection / min(size1, size2))
         - Cosine similarity (using word frequency vectors)
+        - KL divergence (asymmetric distributional difference)
         - Shared vocabulary size and ratio
         - Unique words in each text
         - Most distinctive words for each text
@@ -1526,6 +1527,10 @@ class VocabularyOverlapResult:
         New Phytologist, 11(2), 37-50.
         Salton, G., & McGill, M. J. (1983). Introduction to Modern Information
         Retrieval. McGraw-Hill.
+        Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
+        Annals of Mathematical Statistics, 22(1), 79-86.
+        Manning, C. D., & Schütze, H. (1999). Foundations of Statistical NLP.
+        MIT Press.
 
     Example:
         >>> result = compute_vocabulary_overlap(text1, text2)
@@ -1539,6 +1544,7 @@ class VocabularyOverlapResult:
     dice_coefficient: float  # 2 * intersection / (size1 + size2)
     overlap_coefficient: float  # Intersection / min(size1, size2)
     cosine_similarity: float  # Cosine of frequency vectors
+    kl_divergence: float  # Kullback-Leibler divergence (asymmetric, text1 || text2)
 
     # Vocabulary sizes
     text1_vocab_size: int  # Unique words in text 1
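The new `kl_divergence` field is documented as the asymmetric divergence of text1's word distribution from text2's (text1 || text2). For reference, the quantity can be reproduced from two token lists in a few lines; this is a self-contained sketch with add-one smoothing so the value stays finite — the smoothing choice is an assumption, not taken from the package:

```python
import math
from collections import Counter


def kl_divergence(tokens1: list[str], tokens2: list[str]) -> float:
    """D_KL(P || Q) for word distributions P (text1) and Q (text2), add-one smoothed."""
    vocab = set(tokens1) | set(tokens2)
    c1, c2 = Counter(tokens1), Counter(tokens2)
    n1 = len(tokens1) + len(vocab)
    n2 = len(tokens2) + len(vocab)
    total = 0.0
    for w in vocab:
        p = (c1[w] + 1) / n1
        q = (c2[w] + 1) / n2
        total += p * math.log(p / q)
    return total


# Asymmetric: swapping the arguments gives a different (still non-negative) value.
print(kl_divergence("a a b".split(), "a b b b".split()))
```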
@@ -1897,6 +1903,54 @@ class JohnsBurrowsResult:
     metadata: dict[str, Any]  # Method-specific parameters, z-scores, etc.
 
 
+@dataclass
+class CompressionResult:
+    """Result from compression-based authorship attribution.
+
+    Compression-based methods use the Normalized Compression Distance (NCD) to
+    measure similarity between texts. The intuition is that if two texts are
+    similar, compressing them together will yield better compression than
+    compressing separately. This approach is language-independent and captures
+    deep statistical regularities.
+
+    Related GitHub Issue:
+        #24 - Additional Authorship Attribution Methods
+        https://github.com/craigtrim/pystylometry/issues/24
+
+    Formula:
+        NCD(x,y) = (C(xy) - min(C(x), C(y))) / max(C(x), C(y))
+
+        where C(x) is the compressed size of x, and C(xy) is the compressed
+        size of x and y concatenated.
+
+    Interpretation:
+        - NCD ≈ 0: Texts are very similar
+        - NCD ≈ 1: Texts are very different
+        - Typical same-author pairs: 0.3-0.6
+        - Typical different-author pairs: 0.6-0.9
+
+    References:
+        Cilibrasi, R., & Vitányi, P. M. (2005). Clustering by compression.
+        IEEE Transactions on Information Theory, 51(4), 1523-1545.
+
+        Benedetto, D., Caglioti, E., & Loreto, V. (2002). Language trees and
+        zipping. Physical Review Letters, 88(4), 048702.
+
+    Example:
+        >>> result = compute_compression_distance(text1, text2)
+        >>> print(f"NCD: {result.ncd:.3f}")
+        >>> if result.ncd < 0.5:
+        ...     print("Texts likely by same author")
+    """
+
+    ncd: float  # Normalized Compression Distance [0, 1+]
+    compressor: str  # Compression algorithm used (e.g., "gzip", "zlib", "bz2")
+    text1_compressed_size: int  # Compressed size of text1 alone
+    text2_compressed_size: int  # Compressed size of text2 alone
+    combined_compressed_size: int  # Compressed size of concatenated texts
+    metadata: dict[str, Any]  # Raw sizes, compression ratios, etc.
+
+
 # ===== Rhythm and Prosody Results =====
 # Related to GitHub Issue #25: Rhythm and Prosody Metrics
 # https://github.com/craigtrim/pystylometry/issues/25
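The NCD formula in the `CompressionResult` docstring is easy to reproduce with the standard library. A minimal sketch using `zlib`; the package's own `compression.py` may choose a different compressor or preprocessing, so treat this only as an illustration of the formula, not as the library's implementation:

```python
import zlib


def ncd(x: str, y: str) -> float:
    """Normalized Compression Distance: (C(xy) - min(C(x), C(y))) / max(C(x), C(y))."""
    cx = len(zlib.compress(x.encode("utf-8")))
    cy = len(zlib.compress(y.encode("utf-8")))
    cxy = len(zlib.compress((x + y).encode("utf-8")))
    return (cxy - min(cx, cy)) / max(cx, cy)


a = "It was the best of times, it was the worst of times." * 20
b = "Call me Ishmael. Some years ago, never mind how long precisely." * 20
print(f"NCD(a, a) = {ncd(a, a):.3f}")  # near 0: identical texts compress well together
print(f"NCD(a, b) = {ncd(a, b):.3f}")  # larger: different texts share less structure
```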
pystylometry/authorship/README.md
ADDED
@@ -0,0 +1,21 @@
+# authorship
+
+
+
+
+Authorship attribution methods for comparing texts and determining likely authorship.
+
+## Catalogue
+
+| File | Functions | Method |
+|------|-----------|--------|
+| `burrows_delta.py` | `compute_burrows_delta`, `compute_cosine_delta` | Classic Delta and angular distance variant |
+| `zeta.py` | `compute_zeta` | Zeta method for marker word detection |
+| `kilgarriff.py` | `compute_kilgarriff` | Chi-squared corpus comparison |
+| `additional_methods.py` | `compute_minmax`, `compute_johns_delta` | MinMax distance, Quadratic/Weighted Delta |
+| `compression.py` | `compute_compression_distance` | Normalized Compression Distance (NCD) |
+
+## See Also
+
+- [`consistency/`](../consistency/) applies `compute_kilgarriff` in sliding windows for intra-document drift detection
+- [`lexical/`](../lexical/) provides the vocabulary features many attribution methods rely on
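Combining the catalogue with the signatures visible later in this diff, a pairwise comparison might look like the sketch below. File paths are placeholders; only the `compute_minmax`, `compute_johns_delta`, and `compute_compression_distance` argument lists and result attributes are confirmed in this diff, and the thresholds are illustrative:

```python
from pathlib import Path

from pystylometry.authorship import (
    compute_compression_distance,
    compute_johns_delta,
    compute_minmax,
)

# Hypothetical sample files; any two strings work.
text_a = Path("author_a_sample.txt").read_text(encoding="utf-8")
text_b = Path("unknown_sample.txt").read_text(encoding="utf-8")

minmax = compute_minmax(text_a, text_b, mfw=100)
delta = compute_johns_delta(text_a, text_b, method="quadratic")
comp = compute_compression_distance(text_a, text_b)

print(f"MinMax distance: {minmax.minmax_distance:.3f}")  # lower = more similar
print(f"Quadratic Delta: {delta.delta_score:.3f}")        # lower = more similar
print(f"NCD:             {comp.ncd:.3f}")                 # docstring: < ~0.6 leans same-author
```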
pystylometry/authorship/__init__.py
CHANGED
@@ -2,8 +2,8 @@
 
 This module provides methods for authorship attribution - comparing texts to
 determine whether they were written by the same author. Available methods
-include classic approaches (Burrows' Delta, Zeta)
-(Kilgarriff's chi-squared).
+include classic approaches (Burrows' Delta, Zeta), statistical methods
+(Kilgarriff's chi-squared), and information-theoretic methods (NCD).
 
 Related GitHub Issues:
     #24 - Additional Authorship Attribution Methods
@@ -16,20 +16,23 @@ Available Functions:
     compute_cosine_delta: Angular distance variant of Delta
     compute_zeta: Zeta method for marker word detection
     compute_kilgarriff: Chi-squared method for corpus comparison
-    compute_minmax: Burrows' original min-max method
-    compute_johns_delta: Delta variations (
+    compute_minmax: Burrows' original min-max distance method
+    compute_johns_delta: Delta variations (quadratic, weighted)
+    compute_compression_distance: Normalized Compression Distance (NCD)
 """
 
 from .additional_methods import compute_johns_delta, compute_minmax
 from .burrows_delta import compute_burrows_delta, compute_cosine_delta
+from .compression import compute_compression_distance
 from .kilgarriff import compute_kilgarriff
 from .zeta import compute_zeta
 
 __all__ = [
     "compute_burrows_delta",
+    "compute_compression_distance",
     "compute_cosine_delta",
-    "
+    "compute_johns_delta",
     "compute_kilgarriff",
     "compute_minmax",
-    "
+    "compute_zeta",
 ]
pystylometry/authorship/additional_methods.py
CHANGED
@@ -8,40 +8,150 @@ Related GitHub Issue:
     https://github.com/craigtrim/pystylometry/issues/24
 
 Methods implemented:
-    - Kilgarriff's Chi-squared
-    - Min-Max (Burrows' original method)
-    - John Burrows' Delta variations
+    - Kilgarriff's Chi-squared -> See kilgarriff.py (Issue #31)
+    - Min-Max distance (Burrows' original method)
+    - John Burrows' Delta variations (Quadratic, Weighted)
 
 References:
     Kilgarriff, A. (2001). Comparing corpora. International Journal of Corpus Linguistics.
     Burrows, J. F. (1992). Not unless you ask nicely. Literary and Linguistic Computing.
     Burrows, J. (2005). Who wrote Shamela? Literary and Linguistic Computing.
+    Argamon, S. (2008). Interpreting Burrows's Delta. Literary and Linguistic Computing.
 """
 
+from __future__ import annotations
+
+import math
+from collections import Counter
+
 from .._types import JohnsBurrowsResult, MinMaxResult
+from .._utils import tokenize
 
 
 def compute_minmax(text1: str, text2: str, mfw: int = 100) -> MinMaxResult:
     """
-    Compute Min-Max distance
+    Compute Min-Max distance between two texts.
+
+    This is Burrows' original method from his 1992 paper, before the development
+    of Delta. It normalizes word frequencies using min-max scaling and computes
+    the mean absolute distance between normalized frequency vectors.
 
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
        https://github.com/craigtrim/pystylometry/issues/24
 
+    Algorithm:
+        1. Tokenize both texts and build frequency distributions
+        2. Identify the top N most frequent words in the joint corpus
+        3. Compute relative frequencies for each word in each text
+        4. Normalize each word's frequencies using min-max scaling:
+           normalized(f) = (f - min) / (max - min)
+        5. Compute mean absolute difference of normalized frequencies
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Higher values indicate more different texts
+        - Scale: 0.0 (identical) to 1.0 (maximally different)
+
+    References:
+        Burrows, J. F. (1992). Not unless you ask nicely: The interpretative
+        nexus between analysis and information. Literary and Linguistic
+        Computing, 7(2), 91-109.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
+        mfw: Number of most frequent words to analyze (default: 100)
 
     Returns:
         MinMaxResult with min-max distance and distinctive features.
+
+    Example:
+        >>> result = compute_minmax(text_by_author_a, text_by_author_b)
+        >>> print(f"MinMax distance: {result.minmax_distance:.3f}")
+        >>> print(f"Most distinctive: {result.most_distinctive_features[0]}")
     """
-    #
-
-
-
-
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    if not tokens1 or not tokens2:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
+
+    if not top_words:
+        return MinMaxResult(
+            minmax_distance=0.0,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
+
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Min-Max normalization per feature across both texts
+    # Then compute absolute distance
+    contributions: list[tuple[str, float]] = []
+    total_distance = 0.0
+
+    for i, word in enumerate(top_words):
+        f1, f2 = rel1[i], rel2[i]
+        max_val = max(f1, f2)
+
+        if max_val > 0:
+            # Min-Max normalized distance for this feature
+            dist = abs(f1 - f2) / max_val
+        else:
+            dist = 0.0
+
+        total_distance += dist
+        contributions.append((word, dist))
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    # Mean distance across all features
+    minmax_distance = total_distance / len(top_words) if top_words else 0.0
+
+    return MinMaxResult(
+        minmax_distance=minmax_distance,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "method": "minmax_1992",
+            "all_contributions": contributions,
+        },
     )
 
 
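As implemented above, each feature contributes `abs(f1 - f2) / max(f1, f2)` on relative frequencies, and the reported `minmax_distance` is the mean of those contributions over the MFW list. A tiny worked check of that arithmetic (toy frequencies, not package output):

```python
# Two features with relative frequencies (text1, text2):
#   "the": 0.060 vs 0.050  ->  |0.060 - 0.050| / 0.060 = 0.1667
#   "of":  0.020 vs 0.030  ->  |0.020 - 0.030| / 0.030 = 0.3333
pairs = [("the", 0.060, 0.050), ("of", 0.020, 0.030)]
dists = [abs(f1 - f2) / max(f1, f2) for _, f1, f2 in pairs]
print(sum(dists) / len(dists))  # 0.25: mean of 0.1667 and 0.3333
```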
@@ -54,22 +164,157 @@ def compute_johns_delta(
     """
     Compute John Burrows' Delta variations.
 
+    This implements alternative formulations of Burrows' Delta metric beyond
+    the standard mean absolute z-score difference. The quadratic variant uses
+    squared z-score differences (Euclidean distance), while the weighted variant
+    applies inverse-rank weighting so higher-frequency words contribute more.
+
     Related GitHub Issue:
         #24 - Additional Authorship Attribution Methods
         https://github.com/craigtrim/pystylometry/issues/24
 
+    Methods:
+        - "quadratic": Euclidean distance of z-scores
+          Delta_Q = sqrt(sum((z1_i - z2_i)^2) / n)
+
+        - "weighted": Inverse-rank weighted Delta
+          Delta_W = sum(w_i * |z1_i - z2_i|) / sum(w_i)
+          where w_i = 1 / rank_i
+
+    Interpretation:
+        - Lower values indicate more similar texts (likely same author)
+        - Quadratic Delta penalizes large deviations more than standard Delta
+        - Weighted Delta emphasizes the most frequent words
+
+    References:
+        Burrows, J. (2005). Who wrote Shamela? Verifying the authorship of a
+        parodic text. Literary and Linguistic Computing, 20(4), 437-450.
+        Argamon, S. (2008). Interpreting Burrows's Delta: Geometric and
+        probabilistic foundations. Literary and Linguistic Computing,
+        23(2), 131-147.
+
     Args:
         text1: First text for comparison
         text2: Second text for comparison
-        mfw: Number of most frequent words to analyze
-        method: Delta variation ("quadratic"
+        mfw: Number of most frequent words to analyze (default: 100)
+        method: Delta variation ("quadratic" or "weighted")
 
     Returns:
         JohnsBurrowsResult with delta score and method details.
+
+    Example:
+        >>> result = compute_johns_delta(text1, text2, method="quadratic")
+        >>> print(f"Quadratic Delta: {result.delta_score:.3f}")
     """
-
-
-
-
-
+    if method not in ("quadratic", "weighted"):
+        raise ValueError(f"method must be 'quadratic' or 'weighted', got '{method}'")
+
+    # Tokenize and lowercase
+    tokens1 = [t.lower() for t in tokenize(text1) if t.isalpha()]
+    tokens2 = [t.lower() for t in tokenize(text2) if t.isalpha()]
+
+    if not tokens1 or not tokens2:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": len(tokens1),
+                "text2_size": len(tokens2),
+                "warning": "One or both texts are empty",
+            },
+        )
+
+    # Build frequency distributions
+    freq1 = Counter(tokens1)
+    freq2 = Counter(tokens2)
+    size1 = len(tokens1)
+    size2 = len(tokens2)
+
+    # Joint corpus: top N most frequent words
+    joint: Counter[str] = Counter()
+    joint.update(freq1)
+    joint.update(freq2)
+    top_words = [word for word, _ in joint.most_common(mfw)]
+
+    if not top_words:
+        return JohnsBurrowsResult(
+            delta_score=0.0,
+            method=method,
+            feature_count=0,
+            most_distinctive_features=[],
+            metadata={
+                "text1_size": size1,
+                "text2_size": size2,
+                "warning": "No common words found",
+            },
+        )
+
+    # Relative frequencies
+    rel1 = [freq1.get(w, 0) / size1 for w in top_words]
+    rel2 = [freq2.get(w, 0) / size2 for w in top_words]
+
+    # Mean-normalized differences
+    # With only 2 texts, classical z-scores are degenerate: stdev([a,b]) is
+    # always |a-b|/sqrt(2), producing identical z-scores (±0.707) for all
+    # features with any difference. Instead, we normalize by the mean frequency
+    # of each feature across both texts, which preserves discriminative power:
+    #     normalized_i = (f1_i - f2_i) / mean(f1_i, f2_i)
+    # This weights words proportionally to how much they differ relative to
+    # their expected frequency, preventing high-frequency words from dominating
+    # through absolute differences alone.
+    z1: list[float] = []
+    z2: list[float] = []
+    for i in range(len(top_words)):
+        mean_val = (rel1[i] + rel2[i]) / 2
+        # Normalize by mean frequency; use epsilon for words absent in both
+        norm = mean_val if mean_val > 0 else 1e-10
+        z1.append((rel1[i] - mean_val) / norm)
+        z2.append((rel2[i] - mean_val) / norm)
+
+    # Compute distance based on method
+    contributions: list[tuple[str, float]] = []
+
+    if method == "quadratic":
+        # Quadratic Delta: root mean squared z-score difference
+        squared_diffs: list[float] = []
+        for i, word in enumerate(top_words):
+            diff_sq = (z1[i] - z2[i]) ** 2
+            squared_diffs.append(diff_sq)
+            contributions.append((word, diff_sq))
+
+        delta_score = math.sqrt(sum(squared_diffs) / len(squared_diffs)) if squared_diffs else 0.0
+
+    else:  # weighted
+        # Weighted Delta: inverse-rank weighted mean absolute z-score difference
+        weighted_diffs: list[float] = []
+        weights: list[float] = []
+        for i, word in enumerate(top_words):
+            weight = 1.0 / (i + 1)  # Inverse rank weighting
+            abs_diff = abs(z1[i] - z2[i])
+            weighted_diffs.append(weight * abs_diff)
+            weights.append(weight)
+            contributions.append((word, abs_diff))
+
+        delta_score = sum(weighted_diffs) / sum(weights) if weights else 0.0
+
+    # Sort contributions by magnitude
+    contributions.sort(key=lambda x: x[1], reverse=True)
+
+    return JohnsBurrowsResult(
+        delta_score=delta_score,
+        method=method,
+        feature_count=len(top_words),
+        most_distinctive_features=contributions[:20],
+        metadata={
+            "text1_size": size1,
+            "text2_size": size2,
+            "text1_vocab": len(freq1),
+            "text2_vocab": len(freq2),
+            "mfw_requested": mfw,
+            "z_scores_text1": dict(zip(top_words[:20], z1[:20])),
+            "z_scores_text2": dict(zip(top_words[:20], z2[:20])),
+            "all_contributions": contributions,
+        },
     )
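Both variants are simple aggregations over the per-word normalized differences computed above. A short numeric check of the two formulas (toy values, not package output):

```python
import math

# Toy normalized differences |z1_i - z2_i| for the 3 most frequent words
diffs = [0.8, 0.3, 0.1]

# Quadratic Delta: root mean squared difference
quadratic = math.sqrt(sum(d * d for d in diffs) / len(diffs))

# Weighted Delta: inverse-rank weighted mean absolute difference
weights = [1.0 / (i + 1) for i in range(len(diffs))]
weighted = sum(w * d for w, d in zip(weights, diffs)) / sum(weights)

print(f"quadratic={quadratic:.3f} weighted={weighted:.3f}")
# quadratic = sqrt((0.64 + 0.09 + 0.01) / 3) ≈ 0.497
# weighted  = (1*0.8 + 0.5*0.3 + 0.3333*0.1) / 1.8333 ≈ 0.536
```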