pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/results.py
ADDED
|
@@ -0,0 +1,635 @@
|
|
|
1
|
+
"""Result dataclasses returned by every public analytical verb.
|
|
2
|
+
|
|
3
|
+
Every Result implements the same informal contract:
|
|
4
|
+
|
|
5
|
+
- ``.to_df()`` returns a tidy :class:`pandas.DataFrame`.
|
|
6
|
+
- ``.plot(**kw)`` returns an :class:`altair.Chart`.
|
|
7
|
+
- ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
|
|
8
|
+
evidence for one row of the result.
|
|
9
|
+
- ``.summary()`` returns a short human-readable string.
|
|
10
|
+
|
|
11
|
+
This contract is intentionally a duck-typing convention rather than an
|
|
12
|
+
abstract base class — it keeps Results lightweight and lets them be
|
|
13
|
+
constructed from a plain DataFrame without inheritance gymnastics.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from dataclasses import dataclass, field
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
from typing import TYPE_CHECKING, Any
|
|
21
|
+
|
|
22
|
+
import pandas as pd
|
|
23
|
+
|
|
24
|
+
if TYPE_CHECKING:
|
|
25
|
+
import altair as alt
|
|
26
|
+
|
|
27
|
+
from .corpus import Corpus, CorpusSlice
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def _table_to_html(table: pd.DataFrame, path: str | Path | None, **kw: Any) -> str:
    """Serialise ``table`` to an HTML string, mirroring it to disk on request.

    Keyword arguments are passed straight through to
    :meth:`pandas.DataFrame.to_html`. When ``path`` is not ``None`` the
    markup is additionally written there as UTF-8. The markup is returned
    in both cases.
    """
    rendered: str = str(table.to_html(**kw))
    if path is None:
        return rendered
    Path(path).write_text(rendered, encoding="utf-8")
    return rendered
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _table_to_json(
    table: pd.DataFrame, path: str | Path | None, **kw: Any
) -> str:
    """Render ``table`` as JSON (records orientation by default); optionally
    write to ``path``.

    Columns holding ``pd.Period`` values are converted to strings on a
    copy of the table before serialisation, because pandas's JSON writer
    doesn't know how to represent Period. Two cases are handled:

    - columns with a proper period dtype, and
    - object-dtype columns in which *any* cell is a ``pd.Period``.

    The string form (``"2020"``, ``"2020Q1"``, …) round-trips back to
    Period via :func:`pandas.Period`.

    Parameters
    ----------
    table
        Frame to serialise. It is never mutated.
    path
        Optional destination; when given, the JSON is also written there
        as UTF-8.
    **kw
        Forwarded to :meth:`pandas.DataFrame.to_json`. ``orient`` defaults
        to ``"records"`` when the caller does not supply it.

    Returns
    -------
    str
        The JSON document.
    """
    serialisable = table.copy()
    for col in serialisable.columns:
        column = serialisable[col]
        if isinstance(column.dtype, pd.PeriodDtype):
            has_period = True
        elif column.dtype == object:  # noqa: E721
            # Test each cell with isinstance rather than pd.isna():
            # pd.isna() returns an array for list/array cells, and taking
            # its truth value raises ValueError. Scanning every cell also
            # catches mixed columns whose first value is not a Period.
            has_period = any(isinstance(cell, pd.Period) for cell in column)
        else:
            has_period = False
        if has_period:
            serialisable[col] = column.astype(str)
    kw.setdefault("orient", "records")
    json_str: str = str(serialisable.to_json(**kw))
    if path is not None:
        Path(path).write_text(json_str, encoding="utf-8")
    return json_str
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass(frozen=True)
class KeynessResult:
    """Keyness statistics comparing two corpora, term by term.

    ``table`` holds one row per shared vocabulary item. Its columns
    include ``term``, ``count_a``, ``count_b``, ``score``,
    ``effect_size``, ``p_value``, ``dispersion_a``, ``dispersion_b``
    and the boolean ``dispersion_flag``.
    """

    table: pd.DataFrame
    method: str
    n_a: int
    n_b: int
    label_a: str = "a"
    label_b: str = "b"
    params: dict[str, Any] = field(default_factory=dict)
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None

    def to_df(self) -> pd.DataFrame:
        """Return an independent copy of the result table."""
        snapshot = self.table.copy()
        return snapshot

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as HTML, also writing it to ``path`` when given.

        Additional keyword arguments go to :meth:`pandas.DataFrame.to_html`.
        """
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as JSON (``orient="records"`` unless overridden),
        also writing it to ``path`` when given."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, kind: str = "volcano", **kw: Any) -> alt.Chart:
        """Build an altair chart for this result.

        ``kind`` selects the view:

        - ``"volcano"`` (default): effect size against −log₁₀(*p*).
        - ``"bar"``: top-N horizontal bar chart.
        - ``"scattertext"``: Scattertext-style interactive rank-percentile
          scatter (Kessler 2017), labelled with ``label_a`` / ``label_b``.

        Remaining keyword arguments (``n_labels``, ``n``, ``width``,
        ``height``) are handed to the chosen viz function. Any other
        ``kind`` raises :class:`ValueError`.
        """
        from .viz.keyness import keyness_top_n_bar, keyness_volcano
        from .viz.scattertext import scattertext_plot

        if kind == "volcano":
            chart = keyness_volcano(self.table, **kw)
        elif kind == "bar":
            chart = keyness_top_n_bar(self.table, **kw)
        elif kind == "scattertext":
            chart = scattertext_plot(
                self.table, label_a=self.label_a, label_b=self.label_b, **kw
            )
        else:
            raise ValueError(
                f"unknown kind={kind!r}; expected 'volcano', 'bar', or 'scattertext'"
            )
        return chart

    def explain(self, term: str, n: int = 5, window: int = 5) -> ConcordanceResult:
        """Fetch KWIC lines for ``term`` from both source corpora.

        Up to ``n`` lines per corpus are returned, each with ``window``
        tokens of context. The corpus references must be attached, as
        :meth:`pycorpdiff.Comparison.keyness` does. A ``KeynessResult``
        assembled from a bare DataFrame raises :class:`ValueError`.
        """
        sources_attached = self.corpus_a is not None and self.corpus_b is not None
        if not sources_attached:
            raise ValueError(
                "explain() requires source corpora; "
                "this KeynessResult was constructed without them"
            )
        from .explain import kwic_compare

        return kwic_compare(
            self.corpus_a,
            self.corpus_b,
            target=term,
            window=window,
            n_per_side=n,
            label_a=self.label_a,
            label_b=self.label_b,
        )

    def summary(self) -> str:
        """One-line description: method, corpus sizes and term count."""
        n_terms = len(self.table)
        head = f"KeynessResult({self.method}, |a|={self.n_a:,}, |b|={self.n_b:,}, "
        return head + f"terms={n_terms:,})"
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
@dataclass(frozen=True)
class CollocationShiftResult:
    """How the collocates of one target term differ between two corpora."""

    target: str
    table: pd.DataFrame
    measure: str
    window: int
    label_a: str = "a"
    label_b: str = "b"
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None

    def to_df(self) -> pd.DataFrame:
        """Return an independent copy of the result table."""
        snapshot = self.table.copy()
        return snapshot

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as HTML, also writing it to ``path`` when given.

        Additional keyword arguments go to :meth:`pandas.DataFrame.to_html`.
        """
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as JSON (``orient="records"`` unless overridden),
        also writing it to ``path`` when given."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Build a diverging horizontal bar chart of the top collocate shifts.

        Keyword arguments are forwarded to the viz function.
        """
        from .viz.collocation import collocation_diverging_bar

        chart = collocation_diverging_bar(self.table, **kw)
        return chart

    def explain(self, collocate: str, n: int = 5) -> ConcordanceResult:
        """Fetch KWIC windows in which ``target`` and ``collocate`` co-occur.

        Up to ``n`` lines per corpus are returned, limited to contexts
        where both words fall inside the same window (the window size
        stored on this result). These lines are the per-row evidence
        behind a shift score. Raises :class:`ValueError` when the source
        corpora are not attached.
        """
        sources_attached = self.corpus_a is not None and self.corpus_b is not None
        if not sources_attached:
            raise ValueError(
                "explain() requires source corpora; "
                "this CollocationShiftResult was constructed without them"
            )
        from .explain import kwic_compare

        return kwic_compare(
            self.corpus_a,
            self.corpus_b,
            target=self.target,
            window=self.window,
            n_per_side=n,
            collocate=collocate,
            label_a=self.label_a,
            label_b=self.label_b,
        )

    def summary(self) -> str:
        """One-line description: target, measure, window and collocate count."""
        n_collocates = len(self.table)
        head = (
            f"CollocationShiftResult(target={self.target!r}, "
            f"measure={self.measure}, "
        )
        return head + f"window={self.window}, collocates={n_collocates:,})"
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
@dataclass(frozen=True)
class SemanticShiftResult:
    """How far target terms move in embedding space between two corpora."""

    targets: list[str]
    table: pd.DataFrame
    alignment: str
    label_a: str = "a"
    label_b: str = "b"
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None
    embedder: Any | None = None
    window: int = 5

    def to_df(self) -> pd.DataFrame:
        """Return an independent copy of the result table."""
        snapshot = self.table.copy()
        return snapshot

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as HTML, also writing it to ``path`` when given.

        Additional keyword arguments go to :meth:`pandas.DataFrame.to_html`.
        """
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as JSON (``orient="records"`` unless overridden),
        also writing it to ``path`` when given."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Not available yet; always raises :class:`NotImplementedError`."""
        raise NotImplementedError("SemanticShiftResult.plot() lands in Phase 6")

    def neighbors_before(
        self, target: str | None = None, n: int = 10
    ) -> pd.DataFrame:
        """Top-n contextual neighbours of ``target`` in corpus A.

        The rows come from :func:`pycorpdiff.semantic.neighborhood_drift`,
        keeping those with a non-null ``sim_a`` (terms that appeared in
        A's top-k) and ordering them by ``sim_a``, highest first. The
        source corpora must be attached, as happens when the result is
        built through :meth:`Comparison.semantic_shift`.
        """
        return self._neighborhood(target=target, n=n, side="a")

    def neighbors_after(
        self, target: str | None = None, n: int = 10
    ) -> pd.DataFrame:
        """Top-n contextual neighbours of ``target`` in corpus B."""
        return self._neighborhood(target=target, n=n, side="b")

    def _neighborhood(
        self, target: str | None, n: int, side: str
    ) -> pd.DataFrame:
        """Shared implementation behind the two ``neighbors_*`` methods.

        ``side="a"`` ranks by ``sim_a``; any other value ranks by
        ``sim_b``. A ``target`` of ``None`` is accepted only when the
        result carries exactly one target.
        """
        sources_attached = self.corpus_a is not None and self.corpus_b is not None
        if not sources_attached:
            raise ValueError(
                "neighbors_before / neighbors_after require source corpora; "
                "this SemanticShiftResult was constructed without them"
            )

        chosen = target
        if chosen is None:
            n_targets = len(self.targets)
            if n_targets != 1:
                raise ValueError(
                    f"result carries {n_targets} targets; pass target= to pick one"
                )
            chosen = self.targets[0]
        if chosen not in self.targets:
            raise ValueError(
                f"target={chosen!r} not in result targets {self.targets!r}"
            )

        from .semantic.shift import neighborhood_drift

        drift = neighborhood_drift(
            self.corpus_a,
            self.corpus_b,
            target=chosen,
            k=n,
            embedder=self.embedder,
            window=self.window,
        )
        if side == "a":
            sim_col = "sim_a"
        else:
            sim_col = "sim_b"
        present = drift.dropna(subset=[sim_col])
        ranked = present.sort_values(sim_col, ascending=False, kind="stable")
        return ranked.head(n).reset_index(drop=True)

    def summary(self) -> str:
        """One-line description: targets and alignment method."""
        return "SemanticShiftResult(" + (
            f"targets={self.targets!r}, alignment={self.alignment})"
        )
|
|
315
|
+
|
|
316
|
+
|
|
317
|
+
@dataclass(frozen=True)
class TemporalTrajectory:
    """A time-indexed series for one or more target terms.

    ``table`` has columns ``period``, ``term``, ``count``, ``relfreq``,
    ``ci_lower``, ``ci_upper``.

    The per-target analyses (:meth:`changepoints`,
    :meth:`changepoints_online`, :meth:`interrupted_time_series`,
    :meth:`causal_impact`) all resolve their ``target`` argument the same
    way and all operate on the target's ``relfreq`` series sorted by
    ``period``.
    """

    table: pd.DataFrame
    targets: list[str]
    freq: str

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the underlying table."""
        return self.table.copy()

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as HTML (returns the string and,
        optionally, writes to ``path``). Extra kwargs forward to
        :meth:`pandas.DataFrame.to_html`."""
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as JSON (default ``orient="records"``).
        Returns the JSON string and, optionally, writes to ``path``."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Return a line plot with Wilson CI bands per term."""
        from .viz.trajectory import trajectory_with_ci

        return trajectory_with_ci(self.table, **kw)

    def _resolve_target(self, target: str | None) -> str:
        """Pick the term to analyse, defaulting to the only target.

        Raises
        ------
        ValueError
            If ``target`` is ``None`` and the trajectory does not carry
            exactly one target, or if ``target`` is not one of
            ``self.targets``.
        """
        if target is None:
            if len(self.targets) != 1:
                raise ValueError(
                    f"trajectory carries {len(self.targets)} targets; "
                    "pass target= to pick one"
                )
            return self.targets[0]
        if target not in self.targets:
            raise ValueError(
                f"target={target!r} not in trajectory targets {self.targets!r}"
            )
        return target

    def _relfreq_series(self, target: str) -> pd.Series:
        """Return ``target``'s ``relfreq`` values indexed by ``period``.

        Rows are sorted by ``period`` so every analysis sees the series in
        the same order regardless of the row order of ``table``.
        """
        rows = self.table[self.table["term"] == target]
        return rows.sort_values("period").set_index("period")["relfreq"]

    def changepoints(
        self,
        target: str | None = None,
        method: str = "pelt",
        penalty: float | None = None,
    ) -> pd.DataFrame:
        """Run changepoint detection on a target's relative-frequency series.

        Requires the ``[temporal]`` extra (ruptures). When the
        trajectory holds multiple targets, supply ``target`` to pick one;
        a single-target trajectory uses it automatically.

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved (see the class docstring).
        """
        # Validate before importing the optional dependency so that a bad
        # argument is reported as such even when the extra is missing.
        target = self._resolve_target(target)

        from .temporal.changepoint import detect_changepoints

        sub = self._relfreq_series(target)
        return detect_changepoints(sub, method=method, penalty=penalty)  # type: ignore[arg-type]

    def changepoints_online(
        self,
        target: str | None = None,
        *,
        hazard: float = 0.01,
        mu_0: float | None = None,
        kappa_0: float = 1.0,
        alpha_0: float = 1.0,
        beta_0: float | None = None,
        max_run_length: int | None = None,
    ) -> Any:
        """Bayesian *online* changepoint detection (Adams & MacKay 2007).

        Where :meth:`changepoints` runs PELT offline (needs the full
        series, returns MAP locations after the fact), this runs an
        online forward pass: at each step it updates the posterior
        distribution over the *run length* — the number of periods
        since the last changepoint. The MAP run length collapsing to
        a small value marks a changepoint.

        The ``[temporal]`` extra is *not* needed — ``scipy.stats``,
        already in the base dependency set, is enough.

        Parameters
        ----------
        target
            Which term to analyse. Defaults to the trajectory's single
            target when there's only one.
        hazard, mu_0, kappa_0, alpha_0, beta_0, max_run_length
            Forwarded unchanged to
            :func:`pycorpdiff.temporal.bocpd.bocpd`.

        Returns
        -------
        :class:`pycorpdiff.temporal.bocpd.BocpdResult`

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from .temporal.bocpd import bocpd

        return bocpd(
            self._relfreq_series(target),
            hazard=hazard,
            mu_0=mu_0,
            kappa_0=kappa_0,
            alpha_0=alpha_0,
            beta_0=beta_0,
            max_run_length=max_run_length,
        )

    def interrupted_time_series(
        self,
        event_date: str,
        target: str | None = None,
    ) -> pd.DataFrame:
        """Fit a segmented-regression ITS model around ``event_date``.

        Requires the ``[temporal]`` extra (statsmodels). Returns level
        and slope-change estimates with confidence intervals.

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from .temporal.its import interrupted_time_series

        sub = self._relfreq_series(target)
        return interrupted_time_series(sub, event_date=event_date)

    def causal_impact(
        self,
        event_date: str,
        target: str | None = None,
        *,
        level: float = 0.95,
        n_samples: int = 1000,
        seed: int | None = 0,
        model: str = "local linear trend",
    ) -> Any:
        """Counterfactual causal impact of an event on this trajectory.

        Bayesian structural time-series (Brodersen et al. 2015) — fits a
        local-linear-trend state-space model on the pre-event window
        and projects forward as the counterfactual "what would have
        happened without the event". Observed minus counterfactual is
        the causal effect, with credible intervals from Monte Carlo
        simulation against the joint state-space posterior.

        Requires the ``[temporal]`` extra (statsmodels).

        Parameters
        ----------
        event_date
            Where to place the intervention.
        target
            Which term to analyse. Defaults to the trajectory's single
            target when there's only one.
        level
            Credible-interval level. ``0.95`` → 95% CrI.
        n_samples
            Monte Carlo path count for the joint CrI. ``1000`` is the
            conventional default.
        seed
            RNG seed for reproducibility.
        model
            Trend specification — usually ``"local linear trend"`` (the
            default) or ``"local level"``.

        Returns
        -------
        :class:`pycorpdiff.temporal.causal_impact.CausalImpactResult`

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from dataclasses import replace as _dc_replace

        from .temporal.causal_impact import causal_impact

        result = causal_impact(
            self._relfreq_series(target),
            event_date=event_date,
            level=level,
            n_samples=n_samples,
            seed=seed,
            model=model,
        )
        # Return a copy of the result with its ``target`` field set to the
        # term that was analysed.
        return _dc_replace(result, target=target)

    def forecast(
        self,
        horizon: int = 4,
        *,
        target: str | None = None,
        level: float = 0.95,
        method: str = "auto",
        logit_transform: bool = True,
    ) -> Any:
        """Extend this trajectory ``horizon`` periods forward.

        Wraps state-space exponential smoothing (Hyndman et al. 2008)
        via statsmodels' ``ETSModel`` (for series of length ≥ 8) or
        ``Holt`` linear-trend (for shorter histories). Rates are
        forecast on the logit scale and back-transformed so prediction
        intervals stay in ``[0, 1]``.

        Parameters
        ----------
        horizon
            Periods to project forward.
        target
            Restrict to a single term. ``None`` (default) forecasts
            every term in the trajectory.
        level
            Prediction-interval level. ``0.95`` → 95% PI.
        method
            ``"auto"`` (default), ``"ets"``, or ``"holt"``.
        logit_transform
            Keep the PI in [0, 1] by working on the logit scale.

        Returns
        -------
        :class:`pycorpdiff.temporal.forecast.ForecastResult`
            Carries the history *and* forecast tables; ``.plot()``
            renders the combined chart with a dashed continuation.

        Raises
        ------
        ValueError
            If ``target`` is given but is not one of ``self.targets``.

        Requires the ``[temporal]`` extra (statsmodels).
        """
        # Unlike the single-series analyses, ``None`` here means "all
        # targets", so the shared resolver is only used for validation of
        # an explicit choice.
        if target is None:
            chosen_targets = self.targets
        else:
            chosen_targets = [self._resolve_target(target)]

        from .temporal.forecast import (
            ForecastResult,
            forecast_trajectory,
        )

        fc_table = forecast_trajectory(
            self.table,
            targets=chosen_targets,
            horizon=horizon,
            level=level,
            method=method,  # type: ignore[arg-type]
            logit_transform=logit_transform,
        )
        return ForecastResult(
            history=self.table[self.table["term"].isin(chosen_targets)].copy(),
            forecast=fc_table,
            targets=list(chosen_targets),
            freq=self.freq,
            horizon=horizon,
            level=level,
            method=method,
            params={
                "logit_transform": logit_transform,
            },
        )

    def summary(self) -> str:
        """Return a short description: targets, frequency and the number
        of distinct periods (``0`` when the table has no ``period`` column)."""
        n_periods = self.table["period"].nunique() if "period" in self.table else 0
        return (
            f"TemporalTrajectory(targets={self.targets!r}, freq={self.freq!r}, "
            f"periods={n_periods:,})"
        )
|
|
610
|
+
|
|
611
|
+
|
|
612
|
+
@dataclass(frozen=True)
class ConcordanceResult:
    """Keyword-in-context (KWIC) lines collected for one target term."""

    target: str
    table: pd.DataFrame
    window: int

    def to_df(self) -> pd.DataFrame:
        """Return an independent copy of the concordance table."""
        snapshot = self.table.copy()
        return snapshot

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as HTML, also writing it to ``path`` when given.

        Additional keyword arguments go to :meth:`pandas.DataFrame.to_html`.
        """
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Return the table as JSON (``orient="records"`` unless overridden),
        also writing it to ``path`` when given."""
        return _table_to_json(self.table, path, **kw)

    def summary(self) -> str:
        """One-line description: the target and the number of lines."""
        n_lines = len(self.table)
        return f"ConcordanceResult(target={self.target!r}, lines={n_lines:,})"
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Embedding-based semantic shift and trajectory analysis."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .alignment import procrustes_align
|
|
6
|
+
from .embed import Embedder, HashEmbedder, SBERTEmbedder
|
|
7
|
+
from .shift import neighborhood_drift, semantic_shift
|
|
8
|
+
from .trajectory import semantic_trajectory
|
|
9
|
+
|
|
10
|
+
__all__ = [
|
|
11
|
+
"Embedder",
|
|
12
|
+
"HashEmbedder",
|
|
13
|
+
"SBERTEmbedder",
|
|
14
|
+
"neighborhood_drift",
|
|
15
|
+
"procrustes_align",
|
|
16
|
+
"semantic_shift",
|
|
17
|
+
"semantic_trajectory",
|
|
18
|
+
]
|