pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
"""Effect-size measures for corpus comparison.
|
|
2
|
+
|
|
3
|
+
References
|
|
4
|
+
----------
|
|
5
|
+
Hardie, A. (2014). Log Ratio: An informal introduction. Centre for Corpus
|
|
6
|
+
Approaches to Social Science (CASS).
|
|
7
|
+
|
|
8
|
+
Gabrielatos, C. (2018). Keyness analysis: Nature, metrics and techniques.
|
|
9
|
+
In *Corpus Approaches to Discourse* (pp. 225-258). Routledge.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def log_ratio(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
    smoothing: float = 0.5,
) -> pd.Series:
    """Hardie's LogRatio: ``log2((a+α)/N_a) - log2((b+α)/N_b)``.

    The ``smoothing`` constant (default 0.5) is added to every count
    before normalisation so terms absent from one corpus yield a finite
    (rather than ``±inf``) effect size. Positive LogRatio means the term
    is more frequent in A; negative means more frequent in B.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series indexed by term. They are aligned on the
        union of their indices; terms missing from one side count as 0.
    total_a, total_b
        Corpus sizes used as the rate denominators. Must be positive.
    smoothing
        Additive constant ``α``; must be strictly positive.

    Returns
    -------
    pandas.Series
        Indexed by the union of terms, named ``log_ratio``.

    Raises
    ------
    ValueError
        If ``smoothing`` is not strictly positive, or if either total is
        not positive.
    """
    # ``not smoothing > 0`` (rather than ``smoothing <= 0``) also rejects NaN.
    if not smoothing > 0:
        raise ValueError(f"smoothing must be > 0; got {smoothing}")
    # Same guard as log_likelihood: a zero or negative corpus size would
    # otherwise leak inf/NaN effect sizes into downstream tables.
    if total_a <= 0 or total_b <= 0:
        raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")

    terms = counts_a.index.union(counts_b.index)
    a = counts_a.reindex(terms, fill_value=0).astype(float)
    b = counts_b.reindex(terms, fill_value=0).astype(float)
    rate_a = (a + smoothing) / total_a
    rate_b = (b + smoothing) / total_b
    return pd.Series(np.log2(rate_a / rate_b), index=terms, name="log_ratio")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def percent_diff(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.Series:
    """%DIFF — Gabrielatos's normalised percentage difference.

    Computed as ``(rate_a - rate_b) / rate_b * 100`` where rates are per
    million tokens. The per-million scaling is convention only; it
    cancels out of the ratio.

    No smoothing is applied, so the zero-count cases are:

    * ``b == 0`` and ``a > 0``: ``+inf`` (the term is novel in A) — the
      caller may wish to filter or replace those rows for plotting;
    * ``a == 0`` and ``b > 0``: ``-100``;
    * ``a == 0`` and ``b == 0``: ``NaN``.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series indexed by term. They are aligned on the
        union of their indices; terms missing from one side count as 0.
    total_a, total_b
        Corpus sizes used as the rate denominators. Must be positive.

    Returns
    -------
    pandas.Series
        Indexed by the union of terms, named ``percent_diff``.

    Raises
    ------
    ValueError
        If either total is not positive.
    """
    # Same guard as log_likelihood: without it a zero corpus size turns
    # every rate into inf/NaN and the result is silently meaningless.
    if total_a <= 0 or total_b <= 0:
        raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")

    terms = counts_a.index.union(counts_b.index)
    a = counts_a.reindex(terms, fill_value=0).astype(float)
    b = counts_b.reindex(terms, fill_value=0).astype(float)

    rate_a = a / total_a * 1_000_000
    rate_b = b / total_b * 1_000_000
    # Division by a zero rate is expected here (see the docstring), so
    # silence the floating-point warnings it would otherwise trigger.
    with np.errstate(divide="ignore", invalid="ignore"):
        diff = (rate_a - rate_b) / rate_b * 100.0
    return pd.Series(diff, index=terms, name="percent_diff")
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
"""Dunning's G² log-likelihood statistic.
|
|
2
|
+
|
|
3
|
+
Reference
|
|
4
|
+
---------
|
|
5
|
+
Dunning, T. (1993). Accurate methods for the statistics of surprise and
|
|
6
|
+
coincidence. *Computational Linguistics*, 19(1), 61-74.
|
|
7
|
+
|
|
8
|
+
Notes
|
|
9
|
+
-----
|
|
10
|
+
The G² returned by :func:`log_likelihood` is **signed**: positive when the
|
|
11
|
+
term is overused in corpus A relative to corpus B (i.e. ``a/N_a > b/N_b``)
|
|
12
|
+
and negative when overused in B. This is the convention CASS / Lancaster
|
|
13
|
+
tooling has gravitated toward — it carries direction information without
|
|
14
|
+
needing a separate column. The reported *p*-value uses ``|G²|`` as the
|
|
15
|
+
test statistic; the unsigned form is what's chi-squared distributed.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import numpy as np
|
|
21
|
+
import pandas as pd
|
|
22
|
+
from scipy.special import xlogy
|
|
23
|
+
from scipy.stats import chi2
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def log_likelihood(
    counts_a: pd.Series,
    counts_b: pd.Series,
    total_a: int,
    total_b: int,
) -> pd.DataFrame:
    """Score every term in the union of the two indices with a signed G².

    Both series are reindexed onto the union of their indices, with
    absent terms treated as a count of zero. No min-count filtering
    happens here; callers are expected to filter beforehand.

    The statistic is the two-cell form
    ``2 * (a*ln(a/E_a) + b*ln(b/E_b))`` with
    ``E_x = N_x * (a + b) / (N_a + N_b)``. Its magnitude is clipped at
    zero to absorb float round-off, and its sign is ``+`` when
    ``a/N_a >= b/N_b`` and ``-`` otherwise.

    Parameters
    ----------
    counts_a, counts_b
        Term-frequency series. Index entries are terms; values are cast
        to 64-bit integers.
    total_a, total_b
        Corpus sizes. They enter the expected counts and the rates that
        decide the sign. Both must be positive.

    Returns
    -------
    pandas.DataFrame
        Indexed by term, with columns ``count_a``, ``count_b``,
        ``expected_a``, ``expected_b``, ``g2`` (signed) and ``p_value``
        (χ² survival function of the unsigned statistic, df=1).

    Raises
    ------
    ValueError
        If either total is not positive.
    """
    if total_b <= 0 or total_a <= 0:
        raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")

    vocabulary = counts_a.index.union(counts_b.index)

    def _as_aligned_ints(counts: pd.Series) -> np.ndarray:
        # Put a series on the shared vocabulary, zero-filling missing terms.
        return counts.reindex(vocabulary, fill_value=0).astype(np.int64).to_numpy()

    obs_a = _as_aligned_ints(counts_a)
    obs_b = _as_aligned_ints(counts_b)

    # Expected counts under the null of a common rate in both corpora.
    pooled = obs_a + obs_b
    grand_total = total_a + total_b
    exp_a = total_a * pooled / grand_total
    exp_b = total_b * pooled / grand_total

    # xlogy(0, 0) == 0, so zero-count cells contribute nothing.
    log_terms = (
        xlogy(obs_a, obs_a) - xlogy(obs_a, exp_a) + xlogy(obs_b, obs_b) - xlogy(obs_b, exp_b)
    )
    # G² cannot be negative mathematically; clip the round-off residue.
    magnitude = np.maximum(2.0 * log_terms, 0.0)

    # Direction of overuse: + for A (ties included), - for B.
    direction = np.where(obs_a / total_a >= obs_b / total_b, 1.0, -1.0)

    columns = {
        "count_a": obs_a,
        "count_b": obs_b,
        "expected_a": exp_a,
        "expected_b": exp_b,
        "g2": direction * magnitude,
        "p_value": chi2.sf(magnitude, df=1),
    }
    return pd.DataFrame(columns, index=vocabulary)
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
"""N-way keyness — does a term's rate differ across more than two corpora?
|
|
2
|
+
|
|
3
|
+
The two-corpus :func:`log_likelihood` answers "is this term distinctive
|
|
4
|
+
in A vs B". When you have three or more corpora — three news outlets,
|
|
5
|
+
five parties, ten decades — the natural generalisation is a one-way
|
|
6
|
+
contingency test: for each term, does its observed rate vary across
|
|
7
|
+
corpora more than chance would explain?
|
|
8
|
+
|
|
9
|
+
The reported G² uses the same marginal-only form as the two-way path
|
|
10
|
+
(:func:`pycorpdiff.keyness.loglikelihood.log_likelihood`), so the N=2
|
|
11
|
+
result agrees exactly with the existing keyness pipeline. Asymptotic
|
|
12
|
+
distribution is χ²(df=N−1).
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from collections.abc import Sequence
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
from scipy.special import xlogy
|
|
22
|
+
from scipy.stats import chi2
|
|
23
|
+
|
|
24
|
+
from ..corpus import Corpus, CorpusSlice
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def keyness_multi(
    corpora: Sequence[Corpus | CorpusSlice],
    *,
    labels: Sequence[str] | None = None,
    min_count: int = 5,
    multiple_comparisons: str = "bh",
    stop_words: set[str] | list[str] | None = None,
) -> pd.DataFrame:
    """Test for each term whether its rate varies across ``corpora``.

    For ``N`` corpora and each shared-vocabulary term:

    .. math::

        G^2 = 2 \\sum_{i=1}^{N} O_i \\ln\\!\\left(\\frac{O_i}{E_i}\\right),
        \\quad E_i = \\frac{(\\sum_j O_j)\\, n_i}{\\sum_j n_j}

    where :math:`O_i` is the term's count in corpus *i* and :math:`n_i`
    is corpus *i*'s total tokens. Asymptotic distribution is
    :math:`\\chi^2(\\text{df}=N-1)`.

    Parameters
    ----------
    corpora
        Two or more corpora (or slices) to compare side-by-side. Each
        must provide ``vocab(min_count=1)`` returning a term-count series.
    labels
        Optional labels for the corpora; default ``["corpus_0",
        "corpus_1", ...]``. Used to name the per-corpus count columns,
        so they must be unique.
    min_count
        Drop terms whose total count across all corpora is below this.
    multiple_comparisons
        ``"bh"`` (default), ``"bonferroni"``, or ``"none"``.
    stop_words
        Iterable of terms to exclude before scoring. Corpus totals are
        computed before this filter is applied.

    Returns
    -------
    pandas.DataFrame
        Indexed by term, sorted by G² descending. Columns: per-corpus
        ``count_<label>``, ``g2``, ``p_value``, and
        ``p_adjusted`` (unless ``multiple_comparisons="none"``).

    Raises
    ------
    ValueError
        If fewer than two corpora are given, if ``labels`` has the wrong
        length or contains duplicates, if ``multiple_comparisons`` is not
        a recognised method, or if any corpus has no tokens.
    """
    if len(corpora) < 2:
        raise ValueError(f"need at least 2 corpora; got {len(corpora)}")

    if labels is None:
        labels = [f"corpus_{i}" for i in range(len(corpora))]
    if len(labels) != len(corpora):
        raise ValueError(
            f"labels must have one entry per corpus; "
            f"got {len(labels)} labels for {len(corpora)} corpora"
        )
    # Labels become dict keys / column names below; a repeated label would
    # silently overwrite another corpus's count column.
    if len(set(labels)) != len(labels):
        raise ValueError(f"labels must be unique; got {list(labels)}")

    # Validate before doing any work so a typo fails fast, including on the
    # empty-result path that returns early.
    if multiple_comparisons not in ("bh", "bonferroni", "none"):
        raise ValueError(
            f"multiple_comparisons must be 'bh', 'bonferroni', or 'none'; "
            f"got {multiple_comparisons!r}"
        )

    vocabs = [c.vocab(min_count=1) for c in corpora]
    totals = np.array([int(v.sum()) for v in vocabs], dtype=np.int64)
    if (totals == 0).any():
        which = [labels[i] for i, n in enumerate(totals) if n == 0]
        raise ValueError(f"empty corpora: {which}")

    all_terms = pd.Index([])
    for v in vocabs:
        all_terms = all_terms.union(v.index)

    # Rows are terms, columns are corpora; absent terms count as zero.
    counts_matrix = np.zeros((len(all_terms), len(corpora)), dtype=np.int64)
    for j, v in enumerate(vocabs):
        aligned = v.reindex(all_terms, fill_value=0).astype(np.int64).to_numpy()
        counts_matrix[:, j] = aligned

    keep_mask = counts_matrix.sum(axis=1) >= min_count
    if stop_words is not None:
        stop_set = set(stop_words)
        keep_mask &= ~all_terms.isin(stop_set)
    kept_terms = all_terms[keep_mask]
    kept_counts = counts_matrix[keep_mask]

    if len(kept_terms) == 0:
        # Nothing survived the filters: return a correctly-typed empty frame
        # with the same columns the populated result would have.
        empty = pd.DataFrame(
            {f"count_{label}": pd.Series(dtype=np.int64) for label in labels}
        )
        empty["g2"] = pd.Series(dtype=float)
        empty["p_value"] = pd.Series(dtype=float)
        if multiple_comparisons != "none":
            empty["p_adjusted"] = pd.Series(dtype=float)
        return empty.rename_axis("term")

    total_per_term = kept_counts.sum(axis=1).astype(np.float64)
    grand_total = float(totals.sum())
    expected = total_per_term[:, None] * totals[None, :] / grand_total
    # xlogy(0, 0) == 0, so corpora where the term is absent add nothing.
    unsigned = 2.0 * (
        xlogy(kept_counts, kept_counts) - xlogy(kept_counts, expected)
    ).sum(axis=1)
    # G² cannot be negative mathematically; clip float round-off.
    unsigned = np.maximum(unsigned, 0.0)
    p_value = chi2.sf(unsigned, df=len(corpora) - 1)

    table = pd.DataFrame(
        {f"count_{label}": kept_counts[:, j] for j, label in enumerate(labels)},
        index=kept_terms,
    )
    table["g2"] = unsigned
    table["p_value"] = p_value

    if multiple_comparisons == "bh":
        from .correction import benjamini_hochberg

        table["p_adjusted"] = benjamini_hochberg(p_value)
    elif multiple_comparisons == "bonferroni":
        from .correction import bonferroni

        table["p_adjusted"] = bonferroni(p_value)

    return table.sort_values("g2", ascending=False).rename_axis("term")
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
"""Permutation-test *p*-values for keyness.
|
|
2
|
+
|
|
3
|
+
The asymptotic χ²-based *p*-value reported by :func:`log_likelihood`
|
|
4
|
+
relies on the large-sample approximation Dunning warned about in 1993:
|
|
5
|
+
when expected cell counts are small (~< 5), the χ²(df=1) reference
|
|
6
|
+
distribution stops being faithful. A permutation test sidesteps this
|
|
7
|
+
entirely — it builds the null distribution empirically from the data
|
|
8
|
+
itself by repeatedly shuffling document labels and recomputing G².
|
|
9
|
+
|
|
10
|
+
The standard reference is Westfall & Young (1993), *Resampling-Based
|
|
11
|
+
Multiple Testing*. The "+1/+1" small-sample correction in the *p*-value
|
|
12
|
+
formula is Phipson & Smyth (2010); without it the smallest reportable
|
|
13
|
+
*p* is 0, which is misleading for any finite *B*.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
from scipy import sparse
|
|
21
|
+
from scipy.special import xlogy
|
|
22
|
+
|
|
23
|
+
from ..corpus import Corpus, CorpusSlice
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def permutation_pvalues(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    *,
    terms: pd.Index | list[str] | None = None,
    n_permutations: int = 999,
    seed: int | None = None,
) -> pd.Series:
    """Compute empirical permutation *p*-values for per-term keyness.

    For each permutation, the documents from both corpora are pooled,
    randomly relabelled into two groups preserving the original
    document counts ``|docs(a)|`` and ``|docs(b)|``, and a per-term G²
    is recomputed against the new marginal counts. The reported
    *p*-value for each term is the Phipson–Smyth (2010) empirical
    fraction::

        p = (1 + #{perms with G²_perm ≥ G²_observed}) / (B + 1)

    Documents (rather than tokens) are the unit of exchangeability —
    this matches the null "speeches are exchangeable between frames"
    that most corpus-linguistic two-sample questions are really
    testing.

    Parameters
    ----------
    a, b
        The two corpora (or slices) to test. Each must provide
        ``doc_term_counts(min_count=1)`` returning a documents-by-terms
        count table.
    terms
        Optional restriction to a subset of vocabulary terms. If
        ``None`` (default) every term in the union of vocabularies is
        scored. The restriction only selects which terms are scored:
        document lengths and corpus sizes are always taken from the
        unrestricted count tables, so a term gets the same *p*-value
        (for a given ``seed``) whether or not other terms are scored
        alongside it.
    n_permutations
        Number of label permutations. 999 is the conventional default
        — coarse enough to be fast, fine enough that the smallest
        attainable *p* (= 1/1000 = 0.001) is below the usual 0.01
        screening threshold.
    seed
        Optional random seed for reproducibility.

    Returns
    -------
    pandas.Series
        Indexed by term, named ``p_permutation``. Values in [1/(B+1), 1].

    Raises
    ------
    ValueError
        If ``n_permutations < 1``, if either side has no documents, or
        if the two corpora contain no tokens at all.
    """
    if n_permutations < 1:
        raise ValueError(f"n_permutations must be >= 1; got {n_permutations}")

    dtm_a = a.doc_term_counts(min_count=1)
    dtm_b = b.doc_term_counts(min_count=1)

    all_terms = (
        dtm_a.columns.union(dtm_b.columns) if terms is None else pd.Index(list(terms))
    )

    a_aligned = dtm_a.reindex(columns=all_terms, fill_value=0).astype(np.int64)
    b_aligned = dtm_b.reindex(columns=all_terms, fill_value=0).astype(np.int64)

    a_matrix = sparse.csr_matrix(a_aligned.to_numpy())
    b_matrix = sparse.csr_matrix(b_aligned.to_numpy())
    stacked = sparse.vstack([a_matrix, b_matrix]).tocsr()

    n_docs_a = a_matrix.shape[0]
    n_docs_total = stacked.shape[0]

    if n_docs_a == 0 or n_docs_a == n_docs_total:
        raise ValueError(
            "permutation_pvalues needs at least one document on each side; "
            f"got |docs(a)|={n_docs_a}, |docs(b)|={n_docs_total - n_docs_a}"
        )

    # Document lengths come from the unrestricted count tables, not from
    # ``stacked``: when ``terms`` is a subset, summing only the selected
    # columns would shrink every corpus size and distort both the observed
    # and the permuted G². With ``terms=None`` the two are identical.
    doc_lens = np.concatenate(
        [dtm_a.sum(axis=1).to_numpy(), dtm_b.sum(axis=1).to_numpy()]
    ).astype(np.int64)
    full_counts = np.asarray(stacked.sum(axis=0)).ravel().astype(np.int64)
    total_n = int(doc_lens.sum())

    if total_n == 0:
        raise ValueError("permutation_pvalues needs at least one token across both corpora")

    a_marginal = np.asarray(a_matrix.sum(axis=0)).ravel().astype(np.int64)
    b_marginal = full_counts - a_marginal
    # Rows of ``stacked`` are A's documents followed by B's.
    n_a_observed = int(doc_lens[:n_docs_a].sum())
    n_b_observed = total_n - n_a_observed
    observed_g2 = _g2_unsigned(a_marginal, b_marginal, n_a_observed, n_b_observed)
    observed_abs = np.abs(observed_g2)

    rng = np.random.default_rng(seed)
    extreme = np.zeros(len(all_terms), dtype=np.int64)

    for _ in range(n_permutations):
        perm = rng.permutation(n_docs_total)
        idx_a = perm[:n_docs_a]
        # Vectorised marginal sum for the shuffled "a" assignment.
        marg_a_perm = np.asarray(stacked[idx_a, :].sum(axis=0)).ravel().astype(np.int64)
        marg_b_perm = full_counts - marg_a_perm
        n_a_perm = int(doc_lens[idx_a].sum())
        n_b_perm = total_n - n_a_perm
        if n_a_perm == 0 or n_b_perm == 0:
            # Degenerate permutation — every token landed on one side.
            # It adds nothing to ``extreme`` but still counts in the
            # denominator below.
            continue
        g2_perm = _g2_unsigned(marg_a_perm, marg_b_perm, n_a_perm, n_b_perm)
        extreme += np.abs(g2_perm) >= observed_abs

    # Phipson–Smyth +1/+1 correction: the observed labelling counts as one
    # of the permutations, so p can never be exactly zero.
    p_perm = (extreme + 1.0) / (n_permutations + 1.0)
    return pd.Series(p_perm, index=all_terms, name="p_permutation")
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def _g2_unsigned(
|
|
134
|
+
a: np.ndarray,
|
|
135
|
+
b: np.ndarray,
|
|
136
|
+
n_a: int,
|
|
137
|
+
n_b: int,
|
|
138
|
+
) -> np.ndarray:
|
|
139
|
+
"""Vectorised unsigned G² for a vocabulary of terms.
|
|
140
|
+
|
|
141
|
+
Mirrors the math in :func:`pycorpdiff.keyness.loglikelihood.log_likelihood`
|
|
142
|
+
but operates on already-aligned numpy arrays and skips the sign
|
|
143
|
+
+ p-value bookkeeping. Used internally by the permutation loop
|
|
144
|
+
where the per-iteration cost dominates.
|
|
145
|
+
"""
|
|
146
|
+
obs_sum = a + b
|
|
147
|
+
total = n_a + n_b
|
|
148
|
+
expected_a = n_a * obs_sum / total
|
|
149
|
+
expected_b = n_b * obs_sum / total
|
|
150
|
+
unsigned = 2.0 * (
|
|
151
|
+
xlogy(a, a) - xlogy(a, expected_a) + xlogy(b, b) - xlogy(b, expected_b)
|
|
152
|
+
)
|
|
153
|
+
clipped: np.ndarray = np.maximum(unsigned, 0.0)
|
|
154
|
+
return clipped
|
pycorpdiff/py.typed
ADDED
|
File without changes
|