pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,53 @@
1
+ """Vector-space alignment for diachronic embeddings.
2
+
3
+ Reference
4
+ ---------
5
+ Hamilton, W. L., Leskovec, J., & Jurafsky, D. (2016). Diachronic word
6
+ embeddings reveal statistical laws of semantic change. In *Proceedings
7
+ of ACL 2016*.
8
+
9
+ Schönemann, P. H. (1966). A generalized solution of the orthogonal
10
+ Procrustes problem. *Psychometrika*, 31(1), 1-10.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import numpy as np
16
+ import numpy.typing as npt
17
+
18
+
19
def procrustes_align(
    source: npt.NDArray[np.float64],
    target: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Rotate ``source`` onto ``target`` using the orthogonal Procrustes solution.

    The rotation is Schönemann's closed form ``R = U V^T`` with
    ``U Σ V^T = SVD(source^T target)``. What comes back is the rotated
    matrix ``source @ R``; the operator ``R`` itself is not returned.

    Parameters
    ----------
    source, target
        Matrices of identical shape ``(n, d)``. Typical use is two
        embedding spaces that were trained independently (e.g.
        Hamilton-style diachronic word2vec); vectors produced by one
        shared model already live in a common space and do not need this.

    Returns
    -------
    numpy.ndarray
        ``source`` multiplied by the orthogonal matrix ``R``. Because
        ``R`` is orthogonal, row norms are unchanged, so L2-normalised
        input stays on the unit sphere.

    Raises
    ------
    ValueError
        If the two shapes differ.
    """
    shapes_match = source.shape == target.shape
    if not shapes_match:
        raise ValueError(
            f"source and target must have the same shape; got {source.shape} vs {target.shape}"
        )
    # Floating-point warnings are silenced for the whole computation:
    # rank-deficient input (for instance duplicated rows) can make the
    # matrix product emit a spurious divide-by-zero RuntimeWarning on
    # numpy 2.2.x, which a strict ``filterwarnings`` setup would turn
    # into an error. The SVD copes with the singular case on its own.
    with np.errstate(divide="ignore", invalid="ignore", over="ignore", under="ignore"):
        cross_covariance = np.matmul(source.T, target)
        left, _singular_values, right_t = np.linalg.svd(
            cross_covariance, full_matrices=False
        )
        orthogonal_map = np.matmul(left, right_t)
        aligned: npt.NDArray[np.float64] = np.matmul(source, orthogonal_map)
    return aligned
@@ -0,0 +1,84 @@
1
+ """Embedder protocol, a lazy SBERT default, and a deterministic test helper.
2
+
3
+ The :class:`Embedder` protocol is the package's plug point for vector
4
+ representations. Anything implementing
5
+ ``encode(terms: Sequence[str]) -> np.ndarray`` of shape ``(n, d)``
6
+ satisfies it — a thin wrapper around gensim KeyedVectors, a HuggingFace
7
+ pipeline, your own trained vectors, etc.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import hashlib
13
+ from collections.abc import Sequence
14
+ from dataclasses import dataclass, field
15
+ from typing import Any, Protocol, runtime_checkable
16
+
17
+ import numpy as np
18
+ import numpy.typing as npt
19
+
20
+
21
+ @runtime_checkable
22
+ class Embedder(Protocol):
23
+ """Anything callable that maps a sequence of strings to a 2-D vector array."""
24
+
25
+ def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]: ...
26
+
27
+
28
@dataclass
class SBERTEmbedder:
    """Sentence-transformers implementation of the :class:`Embedder` protocol.

    The ``sentence_transformers`` package belongs to the optional
    ``semantic`` extra. It is imported the first time :meth:`encode`
    runs rather than at module import, so a base install never pulls in
    torch. The default ``all-MiniLM-L6-v2`` model is ~22 MB; for
    non-English corpora a multilingual model such as
    ``paraphrase-multilingual-MiniLM-L12-v2`` is the better choice.
    """

    model_name: str = "all-MiniLM-L6-v2"
    # Typed ``Any`` on purpose: the real class is
    # ``sentence_transformers.SentenceTransformer``, which cannot be
    # imported at module load time without making the base install heavy.
    # Excluded from ``__init__``, ``repr`` and equality comparison.
    _model: Any = field(default=None, init=False, repr=False, compare=False)

    def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]:
        """Encode ``terms`` with the (lazily created) model as a float64 array.

        Raises
        ------
        ImportError
            If ``sentence_transformers`` is not installed.
        """
        model = self._model
        if model is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as exc:
                raise ImportError(
                    "SBERTEmbedder requires sentence-transformers. "
                    "Install with: pip install 'pycorpdiff[semantic]'"
                ) from exc
            model = SentenceTransformer(self.model_name)
            # Cache the loaded model on the instance for later calls.
            self._model = model
        raw_vectors = model.encode(list(terms), convert_to_numpy=True)
        return np.asarray(raw_vectors, dtype=np.float64)
58
+
59
+
60
@dataclass(frozen=True)
class HashEmbedder:
    """Deterministic, model-free embedder for tests and offline demos.

    Every string is mapped to a vector drawn from a random generator
    whose seed comes from the string's SHA-256 digest, so equal strings
    always produce equal vectors. The vectors carry no semantic signal
    beyond string equality; the point is to exercise the orchestration
    code (averaging, alignment, neighbourhood drift) without the cost
    of loading a real model.
    """

    dim: int = 32

    def _unit_vector(self, text: str) -> npt.NDArray[np.float64]:
        """Return the seed-derived vector for ``text``, scaled to unit length."""
        digest = hashlib.sha256(text.encode("utf-8")).digest()
        # First eight digest bytes, masked down to a 32-bit seed.
        seed = int.from_bytes(digest[:8], "big") & 0xFFFFFFFF
        sample = np.random.default_rng(seed=seed).standard_normal(self.dim)
        length = np.linalg.norm(sample)
        if length > 0:
            return sample / length
        return sample

    def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]:
        """Return a ``(len(terms), dim)`` float64 array, one row per term."""
        matrix = np.zeros((len(terms), self.dim), dtype=np.float64)
        for row_index, text in enumerate(terms):
            matrix[row_index] = self._unit_vector(text)
        return matrix
@@ -0,0 +1,224 @@
1
+ """Semantic shift and neighborhood drift between corpora.
2
+
3
+ The default strategy is *averaged contextual embeddings*: for each
4
+ occurrence of the target term in a corpus, encode its surrounding
5
+ window as a sentence, then average across occurrences. The
6
+ corpus-specific representation that comes out is what we compare.
7
+
8
+ This works with any shared-space embedder (SBERT, multilingual SBERT,
9
+ HuggingFace encoders). For Hamilton-style independently-trained
10
+ embeddings, supply ``align="procrustes"`` to rotate the source space
11
+ onto the target space before comparison.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ from typing import Any, Literal
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+ from ..corpus import Corpus, CorpusSlice
22
+ from ..stats import cosine_similarity
23
+ from .embed import Embedder, SBERTEmbedder
24
+
25
+ AlignmentKind = Literal["none", "procrustes"]
26
+
27
+
28
def _window_texts(
    corpus: Corpus | CorpusSlice, target: str, window: int
) -> list[str]:
    """Collect the context window around every occurrence of ``target``.

    Each document returned by ``corpus.tokens()`` is scanned for tokens
    equal to ``target``. For every hit, up to ``window`` tokens on each
    side plus the hit itself are joined with single spaces. Windows are
    clipped at document boundaries and returned in document order, then
    position order.
    """
    snippets: list[str] = []
    for doc in corpus.tokens():
        snippets.extend(
            " ".join(doc[max(0, pos - window) : min(len(doc), pos + window + 1)])
            for pos, token in enumerate(doc)
            if token == target
        )
    return snippets
42
+
43
+
44
def _centroid(vectors: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Return the mean of ``vectors`` taken along axis 0."""
    # The annotated local keeps the return type explicit for type checkers.
    averaged: np.ndarray[Any, Any] = vectors.mean(0)
    return averaged
47
+
48
+
49
def semantic_shift(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str | list[str],
    embedder: Embedder | None = None,
    window: int = 5,
    align: AlignmentKind = "none",
) -> pd.DataFrame:
    """Embedding-space displacement of target term(s) between corpora.

    For each target, every context window in ``a`` and in ``b`` is
    encoded, the window vectors are averaged into one centroid per
    corpus, and the two centroids are compared by cosine similarity.

    Parameters
    ----------
    a, b
        The two corpora (or slices) to compare.
    target
        One or more target terms.
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder` (lazy-loaded sentence-transformers).
        Pass :class:`HashEmbedder` for deterministic offline demos.
    window
        Tokens of context on each side of the target.
    align
        ``"none"`` (default) assumes both corpora are embedded with the
        same model and skips alignment. ``"procrustes"`` rotates
        ``a``'s window-vector matrix onto ``b``'s via orthogonal
        Procrustes — appropriate when the embedder produces independent
        per-corpus spaces. Both matrices are first truncated to the
        smaller number of windows, and the centroids are then computed
        from the truncated matrices.

    Returns
    -------
    pandas.DataFrame
        One row per target term with columns ``target``,
        ``cosine_similarity``, ``cosine_distance``, ``n_contexts_a``,
        ``n_contexts_b``. A target missing from either corpus gets NaN
        similarity and distance. The columns are present even when
        ``target`` is an empty list.

    Raises
    ------
    ValueError
        If ``align`` is not ``"none"`` or ``"procrustes"``.
    """
    # Reject unknown modes up front: a typo such as "procustes" would
    # otherwise silently fall through to the unaligned comparison.
    if align not in ("none", "procrustes"):
        raise ValueError(f"align must be 'none' or 'procrustes'; got {align!r}")

    targets = [target] if isinstance(target, str) else list(target)
    if embedder is None:
        embedder = SBERTEmbedder()

    columns = [
        "target",
        "cosine_similarity",
        "cosine_distance",
        "n_contexts_a",
        "n_contexts_b",
    ]
    rows: list[dict[str, object]] = []
    for tgt in targets:
        wins_a = _window_texts(a, tgt, window=window)
        wins_b = _window_texts(b, tgt, window=window)

        if not wins_a or not wins_b:
            # Nothing to compare on at least one side.
            sim = float("nan")
        else:
            vecs_a = np.asarray(embedder.encode(wins_a), dtype=np.float64)
            vecs_b = np.asarray(embedder.encode(wins_b), dtype=np.float64)

            if align == "procrustes":
                # Procrustes needs two matrices of the same shape, so the
                # larger side is truncated to the smaller side's row count.
                # NOTE(review): rows are paired purely by extraction order,
                # which carries no inherent correspondence between the two
                # corpora — confirm this pairing is intended.
                from .alignment import procrustes_align

                n = min(len(vecs_a), len(vecs_b))
                vecs_a = procrustes_align(vecs_a[:n], vecs_b[:n])
                vecs_b = vecs_b[:n]

            sim = cosine_similarity(_centroid(vecs_a), _centroid(vecs_b))

        rows.append(
            {
                "target": tgt,
                "cosine_similarity": sim,
                "cosine_distance": 1.0 - sim,
                "n_contexts_a": len(wins_a),
                "n_contexts_b": len(wins_b),
            }
        )
    # Explicit columns keep the schema stable when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns)
130
+
131
+
132
def neighborhood_drift(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str,
    k: int = 10,
    embedder: Embedder | None = None,
    window: int = 5,
    min_count: int = 2,
) -> pd.DataFrame:
    """Compare the top-k contextual neighbours of ``target`` in each corpus.

    Candidates are all words that occur in any of the target's windows
    in either corpus, excluding the target itself. A candidate receives
    a similarity in a given corpus only when it occurs at least
    ``min_count`` times in that corpus. Its own window vectors are then
    averaged into a centroid and compared, by cosine similarity, with
    the target's centroid in the same corpus. The result is the union of
    the two top-k sets.

    Parameters
    ----------
    a, b
        The two corpora (or slices) to compare.
    target
        The term whose neighbourhood is examined.
    k
        Number of top neighbours kept per corpus.
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder`.
    window
        Tokens of context on each side of a term.
    min_count
        Minimum number of occurrences a candidate needs in a corpus to
        be scored there.

    Returns
    -------
    pandas.DataFrame
        Columns ``neighbor``, ``sim_a``, ``sim_b``, ``rank_a``,
        ``rank_b``, ``drift`` and ``status``, sorted by ``|drift|``
        descending. ``drift`` is ``sim_a - sim_b`` with a missing
        similarity counted as 0. ``status`` is ``"shared"`` when the
        term is in both top-k sets, ``"gained_in_a"`` when only in
        ``a``'s and ``"lost_in_a"`` when only in ``b``'s. The frame is
        empty, with these columns, when no candidate qualifies.

    Raises
    ------
    ValueError
        If ``target`` does not occur in ``a`` or in ``b``.
    """
    if embedder is None:
        embedder = SBERTEmbedder()

    wins_a = _window_texts(a, target, window=window)
    wins_b = _window_texts(b, target, window=window)
    if not wins_a:
        raise ValueError(f"target {target!r} not found in corpus a")
    if not wins_b:
        raise ValueError(f"target {target!r} not found in corpus b")

    target_vec_a = np.asarray(embedder.encode(wins_a), dtype=np.float64).mean(axis=0)
    target_vec_b = np.asarray(embedder.encode(wins_b), dtype=np.float64).mean(axis=0)

    # Candidate vocabulary: words appearing in any window in either corpus,
    # excluding the target itself.
    candidates: set[str] = set()
    for w in wins_a + wins_b:
        candidates.update(w.split())
    candidates.discard(target)

    sims_a: dict[str, float] = {}
    sims_b: dict[str, float] = {}
    # Iterate in sorted order: set order depends on string-hash
    # randomisation, which would make tie-breaking (and so membership at
    # the top-k boundary) vary from run to run.
    # NOTE(review): every candidate triggers two further corpus scans
    # through ``_window_texts``; fine for small vocabularies, potentially
    # slow for large ones.
    for cand in sorted(candidates):
        cand_wins_a = _window_texts(a, cand, window=window)
        cand_wins_b = _window_texts(b, cand, window=window)
        if len(cand_wins_a) >= min_count:
            cv_a = np.asarray(embedder.encode(cand_wins_a), dtype=np.float64).mean(axis=0)
            sims_a[cand] = cosine_similarity(target_vec_a, cv_a)
        if len(cand_wins_b) >= min_count:
            cv_b = np.asarray(embedder.encode(cand_wins_b), dtype=np.float64).mean(axis=0)
            sims_b[cand] = cosine_similarity(target_vec_b, cv_b)

    # ``sorted`` is stable, so equal similarities keep alphabetical order.
    top_a = sorted(sims_a.items(), key=lambda kv: -kv[1])[:k]
    top_b = sorted(sims_b.items(), key=lambda kv: -kv[1])[:k]
    rank_a = {term: i + 1 for i, (term, _) in enumerate(top_a)}
    rank_b = {term: i + 1 for i, (term, _) in enumerate(top_b)}

    columns = ["neighbor", "sim_a", "sim_b", "rank_a", "rank_b", "drift", "status"]
    rows: list[dict[str, object]] = []
    for term in sorted(set(rank_a) | set(rank_b)):
        s_a = sims_a.get(term, float("nan"))
        s_b = sims_b.get(term, float("nan"))
        in_a = term in rank_a
        in_b = term in rank_b
        if in_a and in_b:
            status = "shared"
        elif in_a:
            status = "gained_in_a"
        else:
            status = "lost_in_a"
        rows.append(
            {
                "neighbor": term,
                "sim_a": s_a,
                "sim_b": s_b,
                "rank_a": rank_a.get(term),
                "rank_b": rank_b.get(term),
                # A similarity that is missing (or NaN) counts as 0 here.
                "drift": (s_a if not np.isnan(s_a) else 0.0)
                - (s_b if not np.isnan(s_b) else 0.0),
                "status": status,
            }
        )

    if not rows:
        # No candidate reached ``min_count`` in either corpus. Without this
        # guard the pipeline below would fail with KeyError on "drift".
        return pd.DataFrame(columns=columns)

    return (
        pd.DataFrame(rows, columns=columns)
        .assign(_abs=lambda d: d["drift"].abs())
        .sort_values("_abs", ascending=False, kind="stable")
        .drop(columns="_abs")
        .reset_index(drop=True)
    )
@@ -0,0 +1,166 @@
1
+ """Multi-period semantic trajectories of target terms.
2
+
3
+ Where :func:`semantic_shift` answers "how did the meaning of *target*
4
+ differ between two corpora", :func:`semantic_trajectory` answers "how
5
+ did the meaning of *target* evolve across a sequence of time periods".
6
+ The algorithm is the per-period extension of averaged contextual
7
+ embeddings:
8
+
9
+ 1. Slice the corpus by time period (yearly / quarterly / monthly).
10
+ 2. For each period, extract every KWIC window around the target.
11
+ 3. Encode each window via the embedder; average into a per-period
12
+ centroid.
13
+ 4. Pick a baseline period (the first populated period by default) and
14
+ report cosine similarity / distance from each period's centroid to
15
+ the baseline.
16
+
17
+ The output is a tidy DataFrame ready to plot — periods on x, distance
18
+ on y, one line per target if you tracked multiple at once.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from typing import Any
24
+
25
+ import numpy as np
26
+ import pandas as pd
27
+
28
+ from ..corpus import Corpus, CorpusSlice
29
+ from ..stats import cosine_similarity
30
+ from .embed import Embedder, SBERTEmbedder
31
+ from .shift import _window_texts
32
+
33
+
34
def semantic_trajectory(
    corpus: Corpus | CorpusSlice,
    target: str | list[str],
    time_col: str = "date",
    freq: str = "Y",
    embedder: Embedder | None = None,
    window: int = 5,
    baseline_period: str | pd.Period | None = None,
) -> pd.DataFrame:
    """Track each target's contextual centroid across time periods.

    Parameters
    ----------
    corpus
        A :class:`Corpus` or :class:`CorpusSlice` with a parseable
        ``time_col``.
    target
        One or more target terms. For a single string, the output
        carries one row per period; for a list, one row per
        (period, term) pair.
    time_col
        Column carrying the document date.
    freq
        Pandas offset alias for the bucketing (``"Y"``, ``"Q"``, ``"M"``,
        ``"W"``, ``"D"``).
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder` (lazy-loaded sentence-transformers).
        Pass :class:`HashEmbedder` for deterministic offline demos.
    window
        Tokens of context on each side of the target.
    baseline_period
        Which period to anchor the trajectory at. Strings (e.g.
        ``"2015"``) are coerced to :class:`pandas.Period` with the
        ``freq`` above. Defaults to the *first populated* period for
        each target — so two targets with different period coverage
        each get their own appropriate baseline.

    Returns
    -------
    pandas.DataFrame
        Columns ``period``, ``target``, ``n_contexts``,
        ``similarity_to_baseline``, ``distance_from_baseline``, sorted
        by target then period. Periods with no occurrences of the
        target carry NaN similarity/distance and ``n_contexts == 0``.
        The frame is empty, with these columns, when there are no
        targets or no periods.

    Raises
    ------
    ValueError
        If an explicit ``baseline_period`` has no contexts for one of
        the targets.
    """
    if embedder is None:
        embedder = SBERTEmbedder()

    targets = [target] if isinstance(target, str) else list(target)
    temporal = corpus.by_time(time_col, freq)
    periods = temporal.periods()

    columns = [
        "period",
        "target",
        "n_contexts",
        "similarity_to_baseline",
        "distance_from_baseline",
    ]
    rows: list[dict[str, object]] = []
    for tgt in targets:
        # Per-period centroid (None when the target is absent) and count.
        centroids: dict[pd.Period, np.ndarray[Any, Any] | None] = {}
        counts: dict[pd.Period, int] = {}
        for period, slice_ in temporal.iter_slices():
            windows = _window_texts(slice_, tgt, window=window)
            counts[period] = len(windows)
            if not windows:
                centroids[period] = None
                continue
            vecs = np.asarray(embedder.encode(windows), dtype=np.float64)
            centroids[period] = vecs.mean(axis=0)

        # Choose the baseline centroid. ``.get`` is used throughout so a
        # period that ``iter_slices()`` did not yield counts as unpopulated.
        baseline_vec: np.ndarray[Any, Any] | None
        if baseline_period is None:
            # First populated period; stays None when the target never
            # appears, in which case every row below is NaN.
            baseline_vec = next(
                (centroids[p] for p in periods if centroids.get(p) is not None),
                None,
            )
        else:
            baseline = (
                pd.Period(baseline_period, freq=freq)
                if isinstance(baseline_period, str)
                else baseline_period
            )
            baseline_vec = centroids.get(baseline)
            if baseline_vec is None:
                raise ValueError(
                    f"baseline_period={baseline_period!r} has no contexts "
                    f"for target {tgt!r}"
                )

        for period in periods:
            vec = centroids.get(period)
            if vec is None or baseline_vec is None:
                sim = float("nan")
            else:
                sim = cosine_similarity(baseline_vec, vec)
            rows.append(
                {
                    "period": period,
                    "target": tgt,
                    "n_contexts": counts.get(period, 0),
                    "similarity_to_baseline": sim,
                    "distance_from_baseline": 1.0 - sim,
                }
            )

    # Explicit columns keep ``sort_values`` from raising KeyError when no
    # rows were produced (no targets, or a corpus without periods).
    return (
        pd.DataFrame(rows, columns=columns)
        .sort_values(["target", "period"], kind="stable")
        .reset_index(drop=True)
    )
pycorpdiff/stats.py ADDED
@@ -0,0 +1,69 @@
1
+ """Small statistical utilities used across the package.
2
+
3
+ Right now this is cosine similarity plus the Wilson score interval — pycorpdiff prefers
4
+ Wilson over Wald for proportion CIs because Wald collapses badly near
5
+ ``p = 0`` or ``p = 1``, which is exactly where temporal trajectories of
6
+ rare terms live.
7
+
8
+ Reference
9
+ ---------
10
+ Wilson, E. B. (1927). Probable inference, the law of succession, and
11
+ statistical inference. *Journal of the American Statistical
12
+ Association*, 22(158), 209-212.
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import numpy as np
18
+ import numpy.typing as npt
19
+ from scipy.special import ndtri
20
+
21
+
22
def cosine_similarity(
    a: npt.NDArray[np.float64], b: npt.NDArray[np.float64]
) -> float:
    """Cosine similarity between two 1-D vectors.

    Returns 0 when either vector is the zero vector — the geometric
    definition is undefined there, and zero is the conservative
    "no relationship" default rather than NaN.

    The result is clipped to ``[-1, 1]``. Mathematically the ratio never
    leaves that range, but floating-point round-off can push it a hair
    past (e.g. slightly above 1 for parallel vectors), which would make
    a derived ``1 - similarity`` distance negative. NaN inputs still
    propagate as NaN.
    """
    na = float(np.linalg.norm(a))
    nb = float(np.linalg.norm(b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    ratio = np.dot(a, b) / (na * nb)
    # np.clip (unlike the builtin min/max) leaves NaN untouched.
    return float(np.clip(ratio, -1.0, 1.0))
36
+
37
+
38
def wilson_ci(
    x: int | npt.NDArray[np.int64],
    n: int | npt.NDArray[np.int64],
    confidence: float = 0.95,
) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
    """Wilson score interval for a binomial proportion ``x / n``.

    Parameters
    ----------
    x
        Number of successes (scalar or array).
    n
        Number of trials (scalar or array).
    confidence
        Two-sided confidence level, strictly between 0 and 1. The
        default 0.95 corresponds to z ≈ 1.96.

    Returns
    -------
    tuple of numpy.ndarray
        ``(lower, upper)`` as float arrays (0-d arrays for scalar
        input), clipped to ``[0, 1]``. Entries where ``n`` is not
        positive are NaN in both bounds.

    Raises
    ------
    ValueError
        If ``confidence`` is not inside ``(0, 1)``.
    """
    if not (confidence > 0 and confidence < 1):
        raise ValueError(f"confidence must be in (0, 1); got {confidence}")

    # Two-sided critical value: the standard-normal quantile at (1 + c) / 2.
    z = float(ndtri(0.5 * (1.0 + confidence)))
    z_sq = z * z

    successes = np.asarray(x, dtype=np.float64)
    trials = np.asarray(n, dtype=np.float64)
    has_trials = trials > 0

    # Division warnings for n == 0 are silenced; those entries are
    # overwritten with NaN below.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_hat = np.where(has_trials, successes / trials, np.nan)
        scale = 1.0 + z_sq / trials
        midpoint = (p_hat + z_sq / (2.0 * trials)) / scale
        variance_term = p_hat * (1.0 - p_hat) / trials + z_sq / (4.0 * trials * trials)
        half_width = z * np.sqrt(variance_term) / scale

    low = np.where(has_trials, midpoint - half_width, np.nan)
    high = np.where(has_trials, midpoint + half_width, np.nan)
    # The interval lies in [0, 1] mathematically; clipping only removes
    # round-off that spills past the ends when p is near 0 or 1.
    return np.clip(low, 0.0, 1.0), np.clip(high, 0.0, 1.0)
@@ -0,0 +1,15 @@
1
+ """Temporal slicing, rolling-window analysis, changepoint detection."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .changepoint import detect_changepoints
6
+ from .its import interrupted_time_series
7
+ from .slicing import TemporalCorpus, Tracker, track
8
+
9
+ __all__ = [
10
+ "TemporalCorpus",
11
+ "Tracker",
12
+ "detect_changepoints",
13
+ "interrupted_time_series",
14
+ "track",
15
+ ]