pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
"""Vector-space alignment for diachronic embeddings.
|
|
2
|
+
|
|
3
|
+
Reference
|
|
4
|
+
---------
|
|
5
|
+
Hamilton, W. L., Leskovec, J., & Jurafsky, D. (2016). Diachronic word
|
|
6
|
+
embeddings reveal statistical laws of semantic change. In *Proceedings
|
|
7
|
+
of ACL 2016*.
|
|
8
|
+
|
|
9
|
+
Schönemann, P. H. (1966). A generalized solution of the orthogonal
|
|
10
|
+
Procrustes problem. *Psychometrika*, 31(1), 1-10.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import numpy as np
|
|
16
|
+
import numpy.typing as npt
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def procrustes_align(
    source: npt.NDArray[np.float64],
    target: npt.NDArray[np.float64],
) -> npt.NDArray[np.float64]:
    """Rotate ``source`` onto ``target`` using the orthogonal Procrustes solution.

    The rotation is Schönemann's closed form ``R = U V^T`` with
    ``U Σ V^T = SVD(source^T target)``. What is returned is the rotated
    matrix ``source @ R``, not ``R`` itself.

    Parameters
    ----------
    source, target
        Matrices of identical shape ``(n, d)``.

    Returns
    -------
    numpy.ndarray
        ``source`` after rotation; same shape as the inputs. Because ``R``
        is orthogonal, row norms of ``source`` are preserved.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.

    Notes
    -----
    Intended for embedding spaces trained independently of one another
    (e.g. Hamilton-style diachronic word2vec). Vectors produced by one
    shared encoder already live in a common space and need no alignment.
    """
    if source.shape != target.shape:
        raise ValueError(
            f"source and target must have the same shape; got {source.shape} vs {target.shape}"
        )
    # Floating-point warnings are silenced for the whole computation: per the
    # original author's note, rank-deficient inputs (e.g. identical rows) can
    # trigger a spurious divide-by-zero RuntimeWarning in matmul on numpy
    # 2.2.x, which strict ``filterwarnings`` settings would turn into an error.
    with np.errstate(divide="ignore", invalid="ignore", over="ignore", under="ignore"):
        cross = np.matmul(source.T, target)
        left, _singular, right_t = np.linalg.svd(cross, full_matrices=False)
        operator = np.matmul(left, right_t)
        aligned: npt.NDArray[np.float64] = np.matmul(source, operator)
    return aligned
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
"""Embedder protocol, a lazy SBERT default, and a deterministic test helper.
|
|
2
|
+
|
|
3
|
+
The :class:`Embedder` protocol is the package's plug point for vector
|
|
4
|
+
representations. Anything implementing
|
|
5
|
+
``encode(terms: Sequence[str]) -> np.ndarray`` of shape ``(n, d)``
|
|
6
|
+
satisfies it — a thin wrapper around gensim KeyedVectors, a HuggingFace
|
|
7
|
+
pipeline, your own trained vectors, etc.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import hashlib
|
|
13
|
+
from collections.abc import Sequence
|
|
14
|
+
from dataclasses import dataclass, field
|
|
15
|
+
from typing import Any, Protocol, runtime_checkable
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import numpy.typing as npt
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@runtime_checkable
class Embedder(Protocol):
    """Structural type for objects that map strings to a 2-D vector array.

    Any object exposing a compatible ``encode`` method satisfies the
    protocol. Because the protocol is ``runtime_checkable``, ``isinstance``
    only verifies that an ``encode`` attribute exists, not its signature.
    """

    def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]:
        """Return one vector per entry of ``terms``."""
        ...
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
@dataclass
class SBERTEmbedder:
    """Default :class:`Embedder`, backed by sentence-transformers.

    sentence-transformers ships in the optional ``semantic`` extra. It is
    imported only on the first :meth:`encode` call, so a base install does
    not pull in torch transitively. The default model ``all-MiniLM-L6-v2``
    is ~22 MB; for non-English corpora prefer a multilingual model such as
    ``paraphrase-multilingual-MiniLM-L12-v2``.
    """

    model_name: str = "all-MiniLM-L6-v2"
    # Typed ``Any`` because the real class,
    # ``sentence_transformers.SentenceTransformer``, cannot be imported at
    # module load time without breaking the light-base-install contract.
    # Excluded from ``__init__``, ``repr`` and comparisons.
    _model: Any = field(default=None, init=False, repr=False, compare=False)

    def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]:
        """Encode ``terms`` with the (lazily loaded) model as float64.

        Raises
        ------
        ImportError
            If sentence-transformers is not installed.
        """
        model = self._model
        if model is None:
            try:
                from sentence_transformers import SentenceTransformer
            except ImportError as exc:
                raise ImportError(
                    "SBERTEmbedder requires sentence-transformers. "
                    "Install with: pip install 'pycorpdiff[semantic]'"
                ) from exc
            model = SentenceTransformer(self.model_name)
            # Cache the loaded model so later calls reuse it.
            object.__setattr__(self, "_model", model)
        raw = model.encode(list(terms), convert_to_numpy=True)
        return np.asarray(raw, dtype=np.float64)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@dataclass(frozen=True)
class HashEmbedder:
    """Deterministic, hash-seeded embedder for tests and offline demos.

    Every input string seeds its own RNG from a SHA-256 digest of the
    string, so equal strings always map to the same vector, and distinct
    strings map to independent pseudo-random unit vectors. The vectors
    carry no *semantic* signal beyond string equality; the class exists so
    the orchestration code (averaging, alignment, neighbourhood drift) can
    be exercised without the cost of a real model.
    """

    dim: int = 32

    def encode(self, terms: Sequence[str]) -> npt.NDArray[np.float64]:
        """Return a ``(len(terms), dim)`` array with one row per term."""
        vectors = np.zeros((len(terms), self.dim), dtype=np.float64)
        for row, text in enumerate(terms):
            sha = hashlib.sha256(text.encode("utf-8")).digest()
            # Seed = low 32 bits of the first eight digest bytes.
            seed32 = int.from_bytes(sha[:8], "big") & 0xFFFFFFFF
            draw = np.random.default_rng(seed=seed32).standard_normal(self.dim)
            length = np.linalg.norm(draw)
            # Scale to unit length; a zero-length draw is stored as-is.
            if length > 0:
                draw = draw / length
            vectors[row] = draw
        return vectors
|
|
@@ -0,0 +1,224 @@
|
|
|
1
|
+
"""Semantic shift and neighborhood drift between corpora.
|
|
2
|
+
|
|
3
|
+
The default strategy is *averaged contextual embeddings*: for each
|
|
4
|
+
occurrence of the target term in a corpus, encode its surrounding
|
|
5
|
+
window as a sentence, then average across occurrences. The
|
|
6
|
+
corpus-specific representation that comes out is what we compare.
|
|
7
|
+
|
|
8
|
+
This works with any shared-space embedder (SBERT, multilingual SBERT,
|
|
9
|
+
HuggingFace encoders). For Hamilton-style independently-trained
|
|
10
|
+
embeddings, supply ``align="procrustes"`` to rotate the source space
|
|
11
|
+
onto the target space before comparison.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from typing import Any, Literal
|
|
17
|
+
|
|
18
|
+
import numpy as np
|
|
19
|
+
import pandas as pd
|
|
20
|
+
|
|
21
|
+
from ..corpus import Corpus, CorpusSlice
|
|
22
|
+
from ..stats import cosine_similarity
|
|
23
|
+
from .embed import Embedder, SBERTEmbedder
|
|
24
|
+
|
|
25
|
+
# Alignment modes accepted by ``semantic_shift``: ``"none"`` skips alignment,
# ``"procrustes"`` applies an orthogonal Procrustes rotation first.
AlignmentKind = Literal["none", "procrustes"]
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _window_texts(
    corpus: Corpus | CorpusSlice, target: str, window: int
) -> list[str]:
    """Collect the context window around every occurrence of ``target``.

    Each document returned by ``corpus.tokens()`` is scanned for tokens
    equal to ``target``. For every hit, up to ``window`` tokens on each
    side (clipped at the document boundaries) plus the hit itself are
    joined with single spaces. Windows are returned in document order,
    then in position order.
    """
    return [
        " ".join(doc[max(0, pos - window) : min(len(doc), pos + window + 1)])
        for doc in corpus.tokens()
        for pos, token in enumerate(doc)
        if token == target
    ]
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _centroid(vectors: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Return the mean of ``vectors`` taken along axis 0."""
    mean_vec: np.ndarray[Any, Any] = vectors.mean(axis=0)
    return mean_vec
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def semantic_shift(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str | list[str],
    embedder: Embedder | None = None,
    window: int = 5,
    align: AlignmentKind = "none",
) -> pd.DataFrame:
    """Embedding-space displacement of target term(s) between corpora.

    For each target, every context window around it is encoded with
    ``embedder``, the window vectors are averaged into one centroid per
    corpus, and the two centroids are compared by cosine similarity.

    Parameters
    ----------
    a, b
        The two corpora (or slices) to compare.
    target
        One or more target terms.
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder` (lazy-loaded sentence-transformers).
        Pass :class:`HashEmbedder` for deterministic offline demos.
    window
        Tokens of context on each side of the target.
    align
        ``"none"`` (default) assumes both corpora are embedded with the
        same model and skips alignment. ``"procrustes"`` rotates ``a``'s
        window-vector matrix onto ``b``'s via orthogonal Procrustes —
        appropriate when the embedder produces independent per-corpus
        spaces. Both matrices are first truncated to the smaller number
        of windows so their shapes match.

    Returns
    -------
    pandas.DataFrame
        One row per target term with ``target``, ``cosine_similarity``,
        ``cosine_distance``, ``n_contexts_a``, ``n_contexts_b``. A target
        with no window in either corpus gets NaN similarity/distance.
        The context counts are the numbers of windows found, before any
        truncation done for alignment. An empty ``target`` list yields an
        empty frame that still carries these columns.
    """
    targets = [target] if isinstance(target, str) else list(target)
    if embedder is None:
        embedder = SBERTEmbedder()

    use_procrustes = align == "procrustes"
    if use_procrustes:
        # Still a lazy import, but resolved once instead of per target.
        from .alignment import procrustes_align

    columns = [
        "target",
        "cosine_similarity",
        "cosine_distance",
        "n_contexts_a",
        "n_contexts_b",
    ]
    rows: list[dict[str, object]] = []
    for tgt in targets:
        wins_a = _window_texts(a, tgt, window=window)
        wins_b = _window_texts(b, tgt, window=window)
        if not wins_a or not wins_b:
            # Nothing to compare on at least one side.
            rows.append(
                {
                    "target": tgt,
                    "cosine_similarity": float("nan"),
                    "cosine_distance": float("nan"),
                    "n_contexts_a": len(wins_a),
                    "n_contexts_b": len(wins_b),
                }
            )
            continue

        vecs_a = np.asarray(embedder.encode(wins_a), dtype=np.float64)
        vecs_b = np.asarray(embedder.encode(wins_b), dtype=np.float64)

        if use_procrustes:
            # Procrustes needs two matrices of the same shape, so both
            # sides are truncated to the first ``n`` windows, where ``n``
            # is the smaller count. Only the centroids are used afterwards.
            n = min(len(vecs_a), len(vecs_b))
            vecs_a = procrustes_align(vecs_a[:n], vecs_b[:n])
            vecs_b = vecs_b[:n]

        centroid_a = _centroid(vecs_a)
        centroid_b = _centroid(vecs_b)
        sim = cosine_similarity(centroid_a, centroid_b)
        rows.append(
            {
                "target": tgt,
                "cosine_similarity": sim,
                "cosine_distance": 1.0 - sim,
                "n_contexts_a": len(wins_a),
                "n_contexts_b": len(wins_b),
            }
        )
    # Explicit columns keep the schema stable even when ``rows`` is empty.
    return pd.DataFrame(rows, columns=columns)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def neighborhood_drift(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str,
    k: int = 10,
    embedder: Embedder | None = None,
    window: int = 5,
    min_count: int = 2,
) -> pd.DataFrame:
    """Compare the top-k contextual neighbours of ``target`` in each corpus.

    Candidate neighbours are the words that occur in any window around
    ``target`` in either corpus (the target itself excluded). For each
    corpus in which a candidate has at least ``min_count`` context windows
    of its own, the candidate's contextual centroid is computed and scored
    by cosine similarity to the target's centroid in that corpus. The
    result is the union of the two top-k sets.

    Parameters
    ----------
    a, b
        The two corpora (or slices) to compare.
    target
        The term whose neighbourhood is inspected.
    k
        Number of top-ranked neighbours kept per corpus.
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder`.
    window
        Tokens of context on each side of a term.
    min_count
        Minimum number of context windows a candidate needs in a corpus
        to be scored there.

    Returns
    -------
    pandas.DataFrame
        Columns ``neighbor``, ``sim_a``, ``sim_b``, ``rank_a``,
        ``rank_b``, ``drift``, ``status``, sorted by ``abs(drift)``
        descending. ``drift`` is ``sim_a - sim_b`` with a missing (NaN)
        similarity counted as 0. ``status`` is ``"shared"`` when the term
        is in both top-k sets, ``"gained_in_a"`` when only in ``a``'s,
        and ``"lost_in_a"`` when only in ``b``'s. If no candidate
        qualifies, an empty frame with these columns is returned.

    Raises
    ------
    ValueError
        If ``target`` does not occur in ``a`` or in ``b``.
    """
    if embedder is None:
        embedder = SBERTEmbedder()

    wins_a = _window_texts(a, target, window=window)
    wins_b = _window_texts(b, target, window=window)
    if not wins_a:
        raise ValueError(f"target {target!r} not found in corpus a")
    if not wins_b:
        raise ValueError(f"target {target!r} not found in corpus b")

    target_vec_a = np.asarray(embedder.encode(wins_a), dtype=np.float64).mean(axis=0)
    target_vec_b = np.asarray(embedder.encode(wins_b), dtype=np.float64).mean(axis=0)

    # Candidate vocabulary: words appearing in any window in either corpus,
    # excluding the target itself.
    candidates: set[str] = set()
    for w in wins_a + wins_b:
        candidates.update(w.split())
    candidates.discard(target)

    sims_a: dict[str, float] = {}
    sims_b: dict[str, float] = {}
    # Sorted iteration keeps processing order independent of set hashing.
    for cand in sorted(candidates):
        cand_wins_a = _window_texts(a, cand, window=window)
        cand_wins_b = _window_texts(b, cand, window=window)
        if len(cand_wins_a) >= min_count:
            cv_a = np.asarray(embedder.encode(cand_wins_a), dtype=np.float64).mean(axis=0)
            sims_a[cand] = cosine_similarity(target_vec_a, cv_a)
        if len(cand_wins_b) >= min_count:
            cv_b = np.asarray(embedder.encode(cand_wins_b), dtype=np.float64).mean(axis=0)
            sims_b[cand] = cosine_similarity(target_vec_b, cv_b)

    def _ranks(sims: dict[str, float]) -> dict[str, int]:
        # Highest similarity first; ties broken alphabetically so that the
        # top-k cut is reproducible from run to run.
        ordered = sorted(sims.items(), key=lambda kv: (-kv[1], kv[0]))[:k]
        return {term: pos for pos, (term, _) in enumerate(ordered, start=1)}

    rank_a = _ranks(sims_a)
    rank_b = _ranks(sims_b)

    columns = ["neighbor", "sim_a", "sim_b", "rank_a", "rank_b", "drift", "status"]
    rows: list[dict[str, object]] = []
    for term in sorted(rank_a.keys() | rank_b.keys()):
        s_a = sims_a.get(term, float("nan"))
        s_b = sims_b.get(term, float("nan"))
        in_a = term in rank_a
        in_b = term in rank_b
        if in_a and in_b:
            status = "shared"
        elif in_a:
            status = "gained_in_a"
        else:
            status = "lost_in_a"
        rows.append(
            {
                "neighbor": term,
                "sim_a": s_a,
                "sim_b": s_b,
                "rank_a": rank_a.get(term),
                "rank_b": rank_b.get(term),
                # A missing similarity counts as 0 so drift stays finite.
                "drift": (s_a if not np.isnan(s_a) else 0.0)
                - (s_b if not np.isnan(s_b) else 0.0),
                "status": status,
            }
        )

    frame = pd.DataFrame(rows, columns=columns)
    if frame.empty:
        # No candidate qualified; without the explicit columns the sort
        # below would fail on the missing "drift" column.
        return frame
    return (
        frame.assign(_abs=lambda d: d["drift"].abs())
        .sort_values("_abs", ascending=False, kind="stable")
        .drop(columns="_abs")
        .reset_index(drop=True)
    )
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
"""Multi-period semantic trajectories of target terms.
|
|
2
|
+
|
|
3
|
+
Where :func:`semantic_shift` answers "how did the meaning of *target*
|
|
4
|
+
differ between two corpora", :func:`semantic_trajectory` answers "how
|
|
5
|
+
did the meaning of *target* evolve across a sequence of time periods".
|
|
6
|
+
The algorithm is the per-period extension of averaged contextual
|
|
7
|
+
embeddings:
|
|
8
|
+
|
|
9
|
+
1. Slice the corpus by time period (yearly / quarterly / monthly).
|
|
10
|
+
2. For each period, extract every KWIC window around the target.
|
|
11
|
+
3. Encode each window via the embedder; average into a per-period
|
|
12
|
+
centroid.
|
|
13
|
+
4. Pick a baseline period (the first populated period by default) and
|
|
14
|
+
report cosine similarity / distance from each period's centroid to
|
|
15
|
+
the baseline.
|
|
16
|
+
|
|
17
|
+
The output is a tidy DataFrame ready to plot — periods on x, distance
|
|
18
|
+
on y, one line per target if you tracked multiple at once.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from typing import Any
|
|
24
|
+
|
|
25
|
+
import numpy as np
|
|
26
|
+
import pandas as pd
|
|
27
|
+
|
|
28
|
+
from ..corpus import Corpus, CorpusSlice
|
|
29
|
+
from ..stats import cosine_similarity
|
|
30
|
+
from .embed import Embedder, SBERTEmbedder
|
|
31
|
+
from .shift import _window_texts
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def semantic_trajectory(
    corpus: Corpus | CorpusSlice,
    target: str | list[str],
    time_col: str = "date",
    freq: str = "Y",
    embedder: Embedder | None = None,
    window: int = 5,
    baseline_period: str | pd.Period | None = None,
) -> pd.DataFrame:
    """Track each target's contextual centroid across time periods.

    Parameters
    ----------
    corpus
        A :class:`Corpus` or :class:`CorpusSlice` with a parseable
        ``time_col``.
    target
        One or more target terms. For a single string, the output
        carries one row per period; for a list, one row per
        (period, term) pair.
    time_col
        Column carrying the document date.
    freq
        Pandas offset alias for the bucketing (``"Y"``, ``"Q"``, ``"M"``,
        ``"W"``, ``"D"``).
    embedder
        Anything satisfying :class:`Embedder`. Defaults to
        :class:`SBERTEmbedder` (lazy-loaded sentence-transformers).
        Pass :class:`HashEmbedder` for deterministic offline demos.
    window
        Tokens of context on each side of the target.
    baseline_period
        Which period to anchor the trajectory at. Strings (e.g.
        ``"2015"``) are coerced to :class:`pandas.Period` with the
        ``freq`` above. Defaults to the *first populated* period for
        each target — so two targets with different period coverage
        each get their own appropriate baseline.

    Returns
    -------
    pandas.DataFrame
        Columns ``period``, ``target``, ``n_contexts``,
        ``similarity_to_baseline``, ``distance_from_baseline``, sorted by
        target then period. Periods with no occurrences of the target
        carry NaN similarity/distance and ``n_contexts == 0``. When there
        are no periods or no targets, the frame is empty but keeps these
        columns.

    Raises
    ------
    ValueError
        If ``baseline_period`` is given and a target has no contexts in
        that period.
    """
    if embedder is None:
        embedder = SBERTEmbedder()

    targets = [target] if isinstance(target, str) else list(target)
    temporal = corpus.by_time(time_col, freq)
    periods = temporal.periods()

    # First pass: per-target, per-period centroids + context counts.
    per_target: dict[str, dict[pd.Period, np.ndarray[Any, Any] | None]] = {}
    per_target_counts: dict[str, dict[pd.Period, int]] = {}
    for tgt in targets:
        centroids: dict[pd.Period, np.ndarray[Any, Any] | None] = {}
        counts: dict[pd.Period, int] = {}
        for period, slice_ in temporal.iter_slices():
            windows = _window_texts(slice_, tgt, window=window)
            counts[period] = len(windows)
            if not windows:
                centroids[period] = None
                continue
            vecs = np.asarray(embedder.encode(windows), dtype=np.float64)
            centroids[period] = vecs.mean(axis=0)
        per_target[tgt] = centroids
        per_target_counts[tgt] = counts

    # Second pass: choose baseline, compute similarities, emit rows.
    columns = [
        "period",
        "target",
        "n_contexts",
        "similarity_to_baseline",
        "distance_from_baseline",
    ]
    rows: list[dict[str, object]] = []
    for tgt in targets:
        centroids = per_target[tgt]
        counts = per_target_counts[tgt]
        # ``.get`` treats a period listed by ``periods()`` but absent from
        # ``iter_slices()`` as unpopulated instead of raising KeyError.
        populated = [p for p in periods if centroids.get(p) is not None]
        if not populated:
            if baseline_period is not None:
                raise ValueError(
                    f"baseline_period={baseline_period!r} has no contexts "
                    f"for target {tgt!r}"
                )
            # Target never appears anywhere — emit NaN rows for completeness.
            for period in periods:
                rows.append(
                    {
                        "period": period,
                        "target": tgt,
                        "n_contexts": 0,
                        "similarity_to_baseline": float("nan"),
                        "distance_from_baseline": float("nan"),
                    }
                )
            continue

        if baseline_period is None:
            baseline = populated[0]
        else:
            baseline = (
                pd.Period(baseline_period, freq=freq)
                if isinstance(baseline_period, str)
                else baseline_period
            )
            if centroids.get(baseline) is None:
                raise ValueError(
                    f"baseline_period={baseline_period!r} has no contexts "
                    f"for target {tgt!r}"
                )

        baseline_vec = centroids[baseline]
        assert baseline_vec is not None  # for mypy
        for period in periods:
            vec = centroids.get(period)
            if vec is None:
                sim = float("nan")
                dist = float("nan")
            else:
                sim = cosine_similarity(baseline_vec, vec)
                dist = 1.0 - sim
            rows.append(
                {
                    "period": period,
                    "target": tgt,
                    "n_contexts": counts.get(period, 0),
                    "similarity_to_baseline": sim,
                    "distance_from_baseline": dist,
                }
            )

    frame = pd.DataFrame(rows, columns=columns)
    if frame.empty:
        # Without the explicit columns, sorting an empty frame by
        # "target"/"period" would raise KeyError.
        return frame
    return frame.sort_values(["target", "period"], kind="stable").reset_index(drop=True)
|
pycorpdiff/stats.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""Small statistical utilities used across the package.
|
|
2
|
+
|
|
3
|
+
Currently a cosine-similarity helper and the Wilson score interval — pycorpdiff prefers
|
|
4
|
+
Wilson over Wald for proportion CIs because Wald collapses badly near
|
|
5
|
+
``p = 0`` or ``p = 1``, which is exactly where temporal trajectories of
|
|
6
|
+
rare terms live.
|
|
7
|
+
|
|
8
|
+
Reference
|
|
9
|
+
---------
|
|
10
|
+
Wilson, E. B. (1927). Probable inference, the law of succession, and
|
|
11
|
+
statistical inference. *Journal of the American Statistical
|
|
12
|
+
Association*, 22(158), 209-212.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import numpy.typing as npt
|
|
19
|
+
from scipy.special import ndtri
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def cosine_similarity(
    a: npt.NDArray[np.float64], b: npt.NDArray[np.float64]
) -> float:
    """Cosine similarity between two 1-D vectors.

    Returns 0 when either vector is the zero vector — the geometric
    definition is undefined there, and zero is the conservative
    "no relationship" default rather than NaN.

    The result is clipped to ``[-1, 1]``. Floating-point roundoff can
    otherwise push the ratio a hair outside that range (e.g. for two
    identical vectors), which would make ``1 - similarity`` negative.
    NaN inputs still propagate as NaN.
    """
    na = float(np.linalg.norm(a))
    nb = float(np.linalg.norm(b))
    if na == 0.0 or nb == 0.0:
        return 0.0
    raw = np.dot(a, b) / (na * nb)
    return float(np.clip(raw, -1.0, 1.0))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def wilson_ci(
    x: int | npt.NDArray[np.int64],
    n: int | npt.NDArray[np.int64],
    confidence: float = 0.95,
) -> tuple[npt.NDArray[np.float64], npt.NDArray[np.float64]]:
    """Wilson score interval for a binomial proportion ``x / n``.

    Parameters
    ----------
    x
        Success count(s).
    n
        Trial count(s); combined elementwise with ``x``.
    confidence
        Two-sided confidence level, strictly between 0 and 1. The default
        0.95 corresponds to z ≈ 1.96.

    Returns
    -------
    tuple
        ``(lower, upper)`` as float64 NumPy values, clipped to ``[0, 1]``.
        Entries where ``n`` is not positive are NaN in both bounds.

    Raises
    ------
    ValueError
        If ``confidence`` is not in ``(0, 1)``.
    """
    if not 0 < confidence < 1:
        raise ValueError(f"confidence must be in (0, 1); got {confidence}")
    # Two-sided critical value: Φ^{-1}((1 + c) / 2).
    z = float(ndtri((1.0 + confidence) / 2.0))
    z_sq = z * z

    successes = np.asarray(x, dtype=np.float64)
    trials = np.asarray(n, dtype=np.float64)
    has_trials = trials > 0

    # Division by zero for n == 0 is expected here; those entries are
    # replaced with NaN below, so the warnings are suppressed.
    with np.errstate(divide="ignore", invalid="ignore"):
        p_hat = np.where(has_trials, successes / trials, np.nan)
        scale = 1.0 + z_sq / trials
        midpoint = (p_hat + z_sq / (2.0 * trials)) / scale
        spread = p_hat * (1.0 - p_hat) / trials + z_sq / (4.0 * trials * trials)
        half_width = z * np.sqrt(spread) / scale

    lo = np.where(has_trials, midpoint - half_width, np.nan)
    hi = np.where(has_trials, midpoint + half_width, np.nan)
    # Wilson's bounds lie in [0, 1] mathematically; the clip only removes
    # roundoff spill near p ≈ 0 and p ≈ 1.
    return np.clip(lo, 0.0, 1.0), np.clip(hi, 0.0, 1.0)
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Temporal slicing, rolling-window analysis, changepoint detection."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .changepoint import detect_changepoints
|
|
6
|
+
from .its import interrupted_time_series
|
|
7
|
+
from .slicing import TemporalCorpus, Tracker, track
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"TemporalCorpus",
|
|
11
|
+
"Tracker",
|
|
12
|
+
"detect_changepoints",
|
|
13
|
+
"interrupted_time_series",
|
|
14
|
+
"track",
|
|
15
|
+
]
|