PyPI - pycorpdiff - Versions diffs - 0.1.0a0__py3-none-any.whl - Mend

pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (61) hide show

pycorpdiff/__init__.py +126 -0
pycorpdiff/_backends/__init__.py +3 -0
pycorpdiff/_backends/pandas.py +3 -0
pycorpdiff/_backends/polars.py +3 -0
pycorpdiff/collocation/__init__.py +19 -0
pycorpdiff/collocation/cooccurrence.py +65 -0
pycorpdiff/collocation/measures.py +102 -0
pycorpdiff/collocation/network.py +233 -0
pycorpdiff/collocation/shift.py +146 -0
pycorpdiff/compare.py +345 -0
pycorpdiff/corpus.py +411 -0
pycorpdiff/datasets/__init__.py +27 -0
pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
pycorpdiff/datasets/_generate_hansard.py +221 -0
pycorpdiff/datasets/hansard.py +235 -0
pycorpdiff/datasets/histwords.py +221 -0
pycorpdiff/explain.py +177 -0
pycorpdiff/io/__init__.py +16 -0
pycorpdiff/io/duckdb.py +92 -0
pycorpdiff/io/huggingface.py +142 -0
pycorpdiff/io/readers.py +138 -0
pycorpdiff/keyness/__init__.py +26 -0
pycorpdiff/keyness/bayes.py +50 -0
pycorpdiff/keyness/chi_squared.py +94 -0
pycorpdiff/keyness/correction.py +34 -0
pycorpdiff/keyness/dispersion.py +89 -0
pycorpdiff/keyness/effect_sizes.py +65 -0
pycorpdiff/keyness/loglikelihood.py +92 -0
pycorpdiff/keyness/multicorpus.py +143 -0
pycorpdiff/keyness/permutation.py +154 -0
pycorpdiff/py.typed +0 -0
pycorpdiff/results.py +635 -0
pycorpdiff/semantic/__init__.py +18 -0
pycorpdiff/semantic/alignment.py +53 -0
pycorpdiff/semantic/embed.py +84 -0
pycorpdiff/semantic/shift.py +224 -0
pycorpdiff/semantic/trajectory.py +166 -0
pycorpdiff/stats.py +69 -0
pycorpdiff/temporal/__init__.py +15 -0
pycorpdiff/temporal/bocpd.py +233 -0
pycorpdiff/temporal/causal_impact.py +293 -0
pycorpdiff/temporal/changepoint.py +92 -0
pycorpdiff/temporal/forecast.py +405 -0
pycorpdiff/temporal/its.py +123 -0
pycorpdiff/temporal/slicing.py +174 -0
pycorpdiff/tokenize.py +110 -0
pycorpdiff/viz/__init__.py +37 -0
pycorpdiff/viz/bocpd.py +173 -0
pycorpdiff/viz/causal_impact.py +142 -0
pycorpdiff/viz/collocation.py +48 -0
pycorpdiff/viz/dispersion.py +117 -0
pycorpdiff/viz/forecast.py +129 -0
pycorpdiff/viz/keyness.py +96 -0
pycorpdiff/viz/network.py +186 -0
pycorpdiff/viz/scattertext.py +160 -0
pycorpdiff/viz/semantic_forecast.py +114 -0
pycorpdiff/viz/trajectory.py +48 -0
pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0

pycorpdiff/collocation/shift.py ADDED Viewed

@@ -0,0 +1,146 @@
+"""Cross-corpus collocation shift — gained / lost collocates of a target."""
+from __future__ import annotations
+from typing import Literal
+import pandas as pd
+from ..corpus import Corpus, CorpusSlice
+from .cooccurrence import collocate_counts
+from .measures import logdice, mi_three, pmi, t_score
+# Names of the association measures accepted by ``collocation_shift``.
+CollocationMeasure = Literal["logDice", "PMI", "t_score", "MI3"]
+# Maps each measure name to whether its scoring call also takes the corpus
+# size N. The keys double as the set of valid ``measure`` values checked by
+# ``collocation_shift``.
+_MEASURES_NEED_N: dict[str, bool] = {
+    "logDice": False,
+    "PMI": True,
+    "t_score": True,
+    "MI3": True,
+}
+def collocation_shift(
+    a: Corpus | CorpusSlice,
+    b: Corpus | CorpusSlice,
+    target: str,
+    window: int = 5,
+    measure: CollocationMeasure = "logDice",
+    min_count: int = 5,
+    smoothing: float = 0.5,
+) -> pd.DataFrame:
+    """Compute the change in target-term collocates between two corpora.
+    For every collocate that meets the combined ``min_count`` threshold,
+    the chosen association measure is computed in each corpus and the
+    difference (``score_a - score_b``) is reported. Laplace ``smoothing``
+    is applied to joint and marginal counts before scoring so collocates
+    absent on one side yield finite scores; the default α=0.5 mirrors
+    Hardie's LogRatio smoothing and Rychlý's logDice convention.
+    Parameters
+    ----------
+    a, b
+        The two corpora (or slices) to compare. ``target`` must appear
+        in both — its complete absence on one side makes the shift
+        undefined.
+    target
+        The pivot term whose collocates we are tracking.
+    window
+        Context size on each side of the target.
+    measure
+        Which association measure to apply.
+    min_count
+        Drop collocates whose ``count_a + count_b`` is below this.
+    smoothing
+        Laplace constant added to joint / marginal counts before
+        scoring. Must be > 0.
+    Returns
+    -------
+    pandas.DataFrame
+        Indexed by collocate, columns: ``count_a``, ``count_b``,
+        ``score_a``, ``score_b``, ``shift``. Sorted by ``|shift|``
+        descending.
+    """
+    if smoothing <= 0:
+        raise ValueError(f"smoothing must be > 0; got {smoothing}")
+    if measure not in _MEASURES_NEED_N:
+        raise ValueError(
+            f"unknown measure={measure!r}; expected one of {list(_MEASURES_NEED_N)}"
+        )
+    tokens_a = a.tokens()
+    tokens_b = b.tokens()
+    cocount_a, fx_a = collocate_counts(tokens_a, target, window=window)
+    cocount_b, fx_b = collocate_counts(tokens_b, target, window=window)
+    if fx_a == 0:
+        raise ValueError(f"target {target!r} not found in corpus a")
+    if fx_b == 0:
+        raise ValueError(f"target {target!r} not found in corpus b")
+    all_collocates = sorted(set(cocount_a) | set(cocount_b))
+    fxy_a_raw = pd.Series(
+        {c: cocount_a.get(c, 0) for c in all_collocates}, dtype="int64"
+    )
+    fxy_b_raw = pd.Series(
+        {c: cocount_b.get(c, 0) for c in all_collocates}, dtype="int64"
+    )
+    keep = (fxy_a_raw + fxy_b_raw) >= min_count
+    fxy_a_raw = fxy_a_raw[keep]
+    fxy_b_raw = fxy_b_raw[keep]
+    if len(fxy_a_raw) == 0:
+        return pd.DataFrame(
+            columns=["count_a", "count_b", "score_a", "score_b", "shift"]
+        ).rename_axis("collocate")
+    vocab_a = a.vocab()
+    vocab_b = b.vocab()
+    fy_a_raw = vocab_a.reindex(fxy_a_raw.index, fill_value=0).astype(float)
+    fy_b_raw = vocab_b.reindex(fxy_b_raw.index, fill_value=0).astype(float)
+    n_a = a.total_tokens()
+    n_b = b.total_tokens()
+    # Laplace smoothing across joint and marginal counts — keeps every
+    # measure finite even for collocates absent on one side. f_x (the
+    # target count) is also smoothed for symmetry.
+    fxy_a = fxy_a_raw + smoothing
+    fxy_b = fxy_b_raw + smoothing
+    fy_a = fy_a_raw + smoothing
+    fy_b = fy_b_raw + smoothing
+    fx_a_s = fx_a + smoothing
+    fx_b_s = fx_b + smoothing
+    if measure == "logDice":
+        score_a = logdice(fxy_a, fx_a_s, fy_a)
+        score_b = logdice(fxy_b, fx_b_s, fy_b)
+    elif measure == "PMI":
+        score_a = pmi(fxy_a, fx_a_s, fy_a, n_a)
+        score_b = pmi(fxy_b, fx_b_s, fy_b, n_b)
+    elif measure == "t_score":
+        score_a = t_score(fxy_a, fx_a_s, fy_a, n_a)
+        score_b = t_score(fxy_b, fx_b_s, fy_b, n_b)
+    else:  # MI3
+        score_a = mi_three(fxy_a, fx_a_s, fy_a, n_a)
+        score_b = mi_three(fxy_b, fx_b_s, fy_b, n_b)
+    shift = score_a - score_b
+    table = pd.DataFrame(
+        {
+            "count_a": fxy_a_raw.astype("int64"),
+            "count_b": fxy_b_raw.astype("int64"),
+            "score_a": score_a,
+            "score_b": score_b,
+            "shift": shift,
+        }
+    )
+    table.index.name = "collocate"
+    sort_key = table["shift"].abs()
+    return (
+        table.assign(_sort_key=sort_key)
+        .sort_values("_sort_key", ascending=False, kind="stable")
+        .drop(columns="_sort_key")
+    )

pycorpdiff/compare.py ADDED Viewed

@@ -0,0 +1,345 @@
+"""Public ``compare()`` facade and the :class:`Comparison` class.
+This module defines the public API surface. Analytical methods delegate
+to the keyness / collocation / semantic subpackages.
+"""
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import TYPE_CHECKING, Literal
+from .corpus import Corpus, CorpusSlice
+if TYPE_CHECKING:
+    from .results import (
+        CollocationShiftResult,
+        ConcordanceResult,
+        KeynessResult,
+        SemanticShiftResult,
+    )
+    from .semantic.embed import Embedder
+# Statistic names accepted by ``Comparison.keyness(method=...)``.
+KeynessMethod = Literal[
+    "log_likelihood", "log_ratio", "bayes_factor", "percent_diff", "chi_squared",
+]
+# Association measures accepted by ``Comparison.collocation_shift(measure=...)``.
+CollocationMeasure = Literal["logDice", "PMI", "t_score", "MI3"]
+# Alignment modes accepted by ``Comparison.semantic_shift(align=...)``.
+EmbeddingAlignment = Literal["none", "procrustes"]
+# Multiple-testing correction names accepted by ``Comparison.keyness``.
+MultipleComparisons = Literal["bh", "bonferroni", "none"]
+# Either a whole corpus or a slice of one: the operand type of a comparison.
+CorpusLike = Corpus | CorpusSlice
+@dataclass(frozen=True)
+class Comparison:
+    """A pairwise comparison of two corpora (or slices).
+    Construct via :func:`compare` rather than directly; this keeps the
+    surface area small and lets the package add specialised
+    constructors (``compare.before_after``, ``compare.over_time``) on
+    the function attribute.
+    """
+    a: CorpusLike
+    b: CorpusLike
+    def keyness(
+        self,
+        method: KeynessMethod = "log_likelihood",
+        effect_size: bool = True,
+        dispersion: bool = False,
+        min_count: int = 5,
+        multiple_comparisons: MultipleComparisons = "bh",
+        stop_words: set[str] | list[str] | None = None,
+        permutation_n: int = 0,
+        permutation_seed: int | None = None,
+    ) -> KeynessResult:
+        """Compute keyness for every shared-vocabulary item.
+        Parameters
+        ----------
+        method
+            Which statistic to sort the result by. ``"log_likelihood"``
+            (default) sorts by signed Dunning G²; ``"chi_squared"``
+            sorts by signed Pearson χ². The other modes
+            (``"log_ratio"``, ``"bayes_factor"``, ``"percent_diff"``)
+            require ``effect_size=True`` and sort by that column.
+        effect_size
+            If True (default), also compute LogRatio (Hardie),
+            %DIFF (Gabrielatos), and the BIC-Bayes factor (Wilson).
+        dispersion
+            If True, compute Juilland's D for both corpora and flag
+            terms where ``D < 0.5`` in either — the canonical "this is
+            driven by one document" heuristic. Off by default because
+            it requires constructing the full doc-term matrices.
+        min_count
+            Drop terms whose ``count_a + count_b`` is below this
+            threshold. Dunning's small-cell unreliability makes the
+            default of 5 the standard recommendation.
+        multiple_comparisons
+            ``"bh"`` (default, Benjamini–Hochberg), ``"bonferroni"``,
+            or ``"none"``. The corrected column is named ``p_adjusted``.
+        stop_words
+            Iterable of terms to exclude before scoring. Useful for
+            filtering function-word noise without modifying the source
+            corpus. Tokens drop *after* vocabulary union, so the corpus
+            totals (used as normalisation denominators) are unaffected.
+        permutation_n
+            If positive, also compute an empirical permutation *p*-value
+            for every retained term and emit it as the ``p_permutation``
+            column. Documents are the unit of exchangeability. Useful
+            when the asymptotic χ² approximation is suspect (small
+            expected counts, very small corpora). ``999`` is the
+            conventional value; cost scales linearly. Disabled by
+            default — this is the expensive opt-in.
+        permutation_seed
+            Optional RNG seed for reproducible permutation *p*-values.
+        """
+        # Imports kept local to break circulars and to keep this module
+        # importable without the keyness machinery on hand.
+        from .keyness.bayes import bayes_factor as _bayes_factor
+        from .keyness.chi_squared import chi_squared as _chi_squared
+        from .keyness.correction import benjamini_hochberg, bonferroni
+        from .keyness.dispersion import juilland_d
+        from .keyness.effect_sizes import log_ratio as _log_ratio
+        from .keyness.effect_sizes import percent_diff as _percent_diff
+        from .keyness.loglikelihood import log_likelihood
+        from .results import KeynessResult
+        dtm_a = self.a.doc_term_counts(min_count=1)
+        dtm_b = self.b.doc_term_counts(min_count=1)
+        vocab_a = dtm_a.sum(axis=0)
+        vocab_b = dtm_b.sum(axis=0)
+        n_a = int(vocab_a.sum())
+        n_b = int(vocab_b.sum())
+        if n_a == 0 or n_b == 0:
+            raise ValueError(
+                f"both corpora must contain at least one token; got |a|={n_a}, |b|={n_b}"
+            )
+        all_terms = vocab_a.index.union(vocab_b.index)
+        a_aligned = vocab_a.reindex(all_terms, fill_value=0).astype("int64")
+        b_aligned = vocab_b.reindex(all_terms, fill_value=0).astype("int64")
+        keep = (a_aligned + b_aligned) >= min_count
+        if stop_words is not None:
+            stop_set = set(stop_words)
+            keep &= ~a_aligned.index.isin(stop_set)
+        a_kept = a_aligned[keep]
+        b_kept = b_aligned[keep]
+        # G² is always computed (cheap, the default sort column). χ² is
+        # computed only when requested — same shape, asymptotically
+        # equivalent, no need to pay for both by default.
+        table = log_likelihood(a_kept, b_kept, n_a, n_b)
+        if method == "chi_squared":
+            chi_table = _chi_squared(a_kept, b_kept, n_a, n_b)
+            table["chi_squared"] = chi_table["chi_squared"]
+        if effect_size:
+            table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
+            table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
+            table["bayes_factor"] = _bayes_factor(a_kept, b_kept, n_a, n_b)
+        if dispersion:
+            kept_terms = table.index
+            disp_a = juilland_d(dtm_a.reindex(columns=kept_terms, fill_value=0))
+            disp_b = juilland_d(dtm_b.reindex(columns=kept_terms, fill_value=0))
+            table["dispersion_a"] = disp_a
+            table["dispersion_b"] = disp_b
+            table["dispersion_flag"] = (disp_a < 0.5) | (disp_b < 0.5)
+        if multiple_comparisons == "bh":
+            table["p_adjusted"] = benjamini_hochberg(table["p_value"].to_numpy())
+        elif multiple_comparisons == "bonferroni":
+            table["p_adjusted"] = bonferroni(table["p_value"].to_numpy())
+        if permutation_n > 0:
+            from .keyness.permutation import permutation_pvalues as _perm
+            p_perm = _perm(
+                self.a, self.b,
+                terms=table.index,
+                n_permutations=permutation_n,
+                seed=permutation_seed,
+            )
+            table["p_permutation"] = p_perm.reindex(table.index)
+        sort_col = {
+            "log_likelihood": "g2",
+            "log_ratio": "log_ratio",
+            "bayes_factor": "bayes_factor",
+            "percent_diff": "percent_diff",
+            "chi_squared": "chi_squared",
+        }[method]
+        if sort_col not in table.columns:
+            # User asked to sort by an effect-size column they disabled.
+            raise ValueError(
+                f"method={method!r} requires effect_size=True so the column exists"
+            )
+        # Sort by |signed score| so direction doesn't bury overuse-in-B terms.
+        sort_key = table[sort_col].abs()
+        table = table.assign(_sort_key=sort_key).sort_values(
+            "_sort_key", ascending=False
+        ).drop(columns="_sort_key")
+        out = table.reset_index().rename(columns={"index": "term"})
+        return KeynessResult(
+            table=out,
+            method=method,
+            n_a=n_a,
+            n_b=n_b,
+            label_a=_corpus_label(self.a),
+            label_b=_corpus_label(self.b),
+            params={
+                "effect_size": effect_size,
+                "dispersion": dispersion,
+                "min_count": min_count,
+                "multiple_comparisons": multiple_comparisons,
+                "stop_words": tuple(stop_words) if stop_words else None,
+                "permutation_n": permutation_n,
+                "permutation_seed": permutation_seed,
+            },
+            corpus_a=self.a,
+            corpus_b=self.b,
+        )
+    def collocation_shift(
+        self,
+        target: str,
+        window: int = 5,
+        measure: CollocationMeasure = "logDice",
+        min_count: int = 5,
+        smoothing: float = 0.5,
+    ) -> CollocationShiftResult:
+        """Compute the change in collocates of ``target`` between a and b.
+        Window-based co-occurrence with Rychlý logDice (default) or PMI /
+        t-score / MI³ as alternatives. Laplace smoothing keeps shifts
+        finite for collocates absent on one side.
+        """
+        from .collocation.shift import collocation_shift as _shift
+        from .results import CollocationShiftResult
+        table = _shift(
+            self.a,
+            self.b,
+            target=target,
+            window=window,
+            measure=measure,
+            min_count=min_count,
+            smoothing=smoothing,
+        )
+        return CollocationShiftResult(
+            target=target,
+            table=table.reset_index(),
+            measure=measure,
+            window=window,
+            label_a=_corpus_label(self.a),
+            label_b=_corpus_label(self.b),
+            corpus_a=self.a,
+            corpus_b=self.b,
+        )
+    def semantic_shift(
+        self,
+        target: str | list[str],
+        embedder: Embedder | None = None,
+        window: int = 5,
+        align: EmbeddingAlignment = "none",
+    ) -> SemanticShiftResult:
+        """Compute embedding-space displacement of target term(s).
+        Uses *averaged contextual embeddings*: every window around the
+        target in each corpus is encoded by ``embedder`` and averaged
+        into a corpus-specific centroid. The cosine distance between
+        centroids is the reported shift.
+        ``embedder`` defaults to :class:`SBERTEmbedder` (requires the
+        ``semantic`` extra). For deterministic offline demos pass
+        :class:`pycorpdiff.semantic.HashEmbedder`.
+        ``align="procrustes"`` is appropriate when the embedder produces
+        independent per-corpus spaces (Hamilton-style diachronic
+        word2vec). Modern shared-model encoders like SBERT live in a
+        common space, so the default is ``"none"``.
+        """
+        from .results import SemanticShiftResult
+        from .semantic.embed import SBERTEmbedder
+        from .semantic.shift import semantic_shift as _shift
+        effective_embedder = embedder if embedder is not None else SBERTEmbedder()
+        table = _shift(
+            self.a, self.b, target=target, embedder=effective_embedder,
+            window=window, align=align,
+        )
+        targets = [target] if isinstance(target, str) else list(target)
+        return SemanticShiftResult(
+            targets=targets,
+            table=table,
+            alignment=align,
+            label_a=_corpus_label(self.a),
+            label_b=_corpus_label(self.b),
+            corpus_a=self.a,
+            corpus_b=self.b,
+            embedder=effective_embedder,
+            window=window,
+        )
+    def concordance(
+        self, target: str, n: int = 20, window: int = 5
+    ) -> ConcordanceResult:
+        """Return side-by-side KWIC examples of ``target`` from both corpora.
+        Up to ``n`` lines per corpus are returned, concatenated into a
+        single :class:`ConcordanceResult` with a ``corpus`` column
+        distinguishing the source. Shortcut for
+        ``pycorpdiff.explain.kwic_compare(a, b, target, ...)``.
+        """
+        from .explain import kwic_compare
+        return kwic_compare(
+            self.a,
+            self.b,
+            target=target,
+            window=window,
+            n_per_side=n,
+            label_a=_corpus_label(self.a),
+            label_b=_corpus_label(self.b),
+        )
+def compare(a: CorpusLike, b: CorpusLike) -> Comparison:
+    """Construct a pairwise :class:`Comparison` of two corpora or slices."""
+    return Comparison(a=a, b=b)
+def _corpus_label(c: CorpusLike) -> str:
+    return c.label if isinstance(c, CorpusSlice) else "corpus"
+def _before_after(
+    corpus: Corpus,
+    event_date: str,
+    time_col: str = "date",
+) -> Comparison:
+    """Construct a before/after Comparison split on ``event_date``.
+    The before-slice contains documents with ``time_col < event_date``;
+    the after-slice contains documents with ``time_col >= event_date``.
+    """
+    import pandas as pd
+    if time_col not in corpus.docs.columns:
+        raise KeyError(f"time_col={time_col!r} not found in corpus columns")
+    event = pd.Timestamp(event_date)
+    times = pd.to_datetime(corpus.docs[time_col])
+    before = CorpusSlice(parent=corpus, mask=times < event, filters={"before": event_date})
+    after = CorpusSlice(parent=corpus, mask=times >= event, filters={"after": event_date})
+    return Comparison(a=before, b=after)
+# Expose the specialised constructor as an attribute of the public ``compare``
+# function so users can write ``pcd.compare.before_after(...)`` — matches the
+# API shape promised in the README. Functions do not declare arbitrary
+# attributes, hence the type-checker suppression on the assignment.
+compare.before_after = _before_after  # type: ignore[attr-defined]