pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,146 @@
1
+ """Cross-corpus collocation shift — gained / lost collocates of a target."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Literal
6
+
7
+ import pandas as pd
8
+
9
+ from ..corpus import Corpus, CorpusSlice
10
+ from .cooccurrence import collocate_counts
11
+ from .measures import logdice, mi_three, pmi, t_score
12
+
13
+ CollocationMeasure = Literal["logDice", "PMI", "t_score", "MI3"]
14
+
15
+ _MEASURES_NEED_N: dict[str, bool] = {
16
+ "logDice": False,
17
+ "PMI": True,
18
+ "t_score": True,
19
+ "MI3": True,
20
+ }
21
+
22
+
23
+ def collocation_shift(
24
+ a: Corpus | CorpusSlice,
25
+ b: Corpus | CorpusSlice,
26
+ target: str,
27
+ window: int = 5,
28
+ measure: CollocationMeasure = "logDice",
29
+ min_count: int = 5,
30
+ smoothing: float = 0.5,
31
+ ) -> pd.DataFrame:
32
+ """Compute the change in target-term collocates between two corpora.
33
+
34
+ For every collocate that meets the combined ``min_count`` threshold,
35
+ the chosen association measure is computed in each corpus and the
36
+ difference (``score_a - score_b``) is reported. Laplace ``smoothing``
37
+ is applied to joint and marginal counts before scoring so collocates
38
+ absent on one side yield finite scores; the default α=0.5 mirrors
39
+ Hardie's LogRatio smoothing and Rychlý's logDice convention.
40
+
41
+ Parameters
42
+ ----------
43
+ a, b
44
+ The two corpora (or slices) to compare. ``target`` must appear
45
+ in both — its complete absence on one side makes the shift
46
+ undefined.
47
+ target
48
+ The pivot term whose collocates we are tracking.
49
+ window
50
+ Context size on each side of the target.
51
+ measure
52
+ Which association measure to apply.
53
+ min_count
54
+ Drop collocates whose ``count_a + count_b`` is below this.
55
+ smoothing
56
+ Laplace constant added to joint / marginal counts before
57
+ scoring. Must be > 0.
58
+
59
+ Returns
60
+ -------
61
+ pandas.DataFrame
62
+ Indexed by collocate, columns: ``count_a``, ``count_b``,
63
+ ``score_a``, ``score_b``, ``shift``. Sorted by ``|shift|``
64
+ descending.
65
+ """
66
+ if smoothing <= 0:
67
+ raise ValueError(f"smoothing must be > 0; got {smoothing}")
68
+ if measure not in _MEASURES_NEED_N:
69
+ raise ValueError(
70
+ f"unknown measure={measure!r}; expected one of {list(_MEASURES_NEED_N)}"
71
+ )
72
+
73
+ tokens_a = a.tokens()
74
+ tokens_b = b.tokens()
75
+ cocount_a, fx_a = collocate_counts(tokens_a, target, window=window)
76
+ cocount_b, fx_b = collocate_counts(tokens_b, target, window=window)
77
+
78
+ if fx_a == 0:
79
+ raise ValueError(f"target {target!r} not found in corpus a")
80
+ if fx_b == 0:
81
+ raise ValueError(f"target {target!r} not found in corpus b")
82
+
83
+ all_collocates = sorted(set(cocount_a) | set(cocount_b))
84
+ fxy_a_raw = pd.Series(
85
+ {c: cocount_a.get(c, 0) for c in all_collocates}, dtype="int64"
86
+ )
87
+ fxy_b_raw = pd.Series(
88
+ {c: cocount_b.get(c, 0) for c in all_collocates}, dtype="int64"
89
+ )
90
+ keep = (fxy_a_raw + fxy_b_raw) >= min_count
91
+ fxy_a_raw = fxy_a_raw[keep]
92
+ fxy_b_raw = fxy_b_raw[keep]
93
+
94
+ if len(fxy_a_raw) == 0:
95
+ return pd.DataFrame(
96
+ columns=["count_a", "count_b", "score_a", "score_b", "shift"]
97
+ ).rename_axis("collocate")
98
+
99
+ vocab_a = a.vocab()
100
+ vocab_b = b.vocab()
101
+ fy_a_raw = vocab_a.reindex(fxy_a_raw.index, fill_value=0).astype(float)
102
+ fy_b_raw = vocab_b.reindex(fxy_b_raw.index, fill_value=0).astype(float)
103
+
104
+ n_a = a.total_tokens()
105
+ n_b = b.total_tokens()
106
+
107
+ # Laplace smoothing across joint and marginal counts — keeps every
108
+ # measure finite even for collocates absent on one side. f_x (the
109
+ # target count) is also smoothed for symmetry.
110
+ fxy_a = fxy_a_raw + smoothing
111
+ fxy_b = fxy_b_raw + smoothing
112
+ fy_a = fy_a_raw + smoothing
113
+ fy_b = fy_b_raw + smoothing
114
+ fx_a_s = fx_a + smoothing
115
+ fx_b_s = fx_b + smoothing
116
+
117
+ if measure == "logDice":
118
+ score_a = logdice(fxy_a, fx_a_s, fy_a)
119
+ score_b = logdice(fxy_b, fx_b_s, fy_b)
120
+ elif measure == "PMI":
121
+ score_a = pmi(fxy_a, fx_a_s, fy_a, n_a)
122
+ score_b = pmi(fxy_b, fx_b_s, fy_b, n_b)
123
+ elif measure == "t_score":
124
+ score_a = t_score(fxy_a, fx_a_s, fy_a, n_a)
125
+ score_b = t_score(fxy_b, fx_b_s, fy_b, n_b)
126
+ else: # MI3
127
+ score_a = mi_three(fxy_a, fx_a_s, fy_a, n_a)
128
+ score_b = mi_three(fxy_b, fx_b_s, fy_b, n_b)
129
+
130
+ shift = score_a - score_b
131
+ table = pd.DataFrame(
132
+ {
133
+ "count_a": fxy_a_raw.astype("int64"),
134
+ "count_b": fxy_b_raw.astype("int64"),
135
+ "score_a": score_a,
136
+ "score_b": score_b,
137
+ "shift": shift,
138
+ }
139
+ )
140
+ table.index.name = "collocate"
141
+ sort_key = table["shift"].abs()
142
+ return (
143
+ table.assign(_sort_key=sort_key)
144
+ .sort_values("_sort_key", ascending=False, kind="stable")
145
+ .drop(columns="_sort_key")
146
+ )
pycorpdiff/compare.py ADDED
@@ -0,0 +1,345 @@
1
+ """Public ``compare()`` facade and the :class:`Comparison` class.
2
+
3
+ This module defines the public API surface. Analytical methods delegate
4
+ to the keyness / collocation / semantic subpackages.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import TYPE_CHECKING, Literal
11
+
12
+ from .corpus import Corpus, CorpusSlice
13
+
14
+ if TYPE_CHECKING:
15
+ from .results import (
16
+ CollocationShiftResult,
17
+ ConcordanceResult,
18
+ KeynessResult,
19
+ SemanticShiftResult,
20
+ )
21
+ from .semantic.embed import Embedder
22
+
23
+
24
# Statistic that a keyness table can be ranked by.
KeynessMethod = Literal[
    "log_likelihood",
    "log_ratio",
    "bayes_factor",
    "percent_diff",
    "chi_squared",
]
# Association measures accepted by ``Comparison.collocation_shift``.
CollocationMeasure = Literal["logDice", "PMI", "t_score", "MI3"]
# How per-corpus embedding spaces are reconciled before measuring shift.
EmbeddingAlignment = Literal["none", "procrustes"]
# Multiple-testing correction applied to keyness p-values.
MultipleComparisons = Literal["bh", "bonferroni", "none"]
# Anything a comparison can be built from: a whole corpus or a slice of one.
CorpusLike = Corpus | CorpusSlice
31
+
32
+
33
@dataclass(frozen=True)
class Comparison:
    """A pairwise comparison of two corpora (or slices).

    Construct via :func:`compare` rather than directly; this keeps the
    surface area small and lets the package add specialised
    constructors (``compare.before_after``, ``compare.over_time``) on
    the function attribute.
    """

    a: CorpusLike
    b: CorpusLike

    def keyness(
        self,
        method: KeynessMethod = "log_likelihood",
        effect_size: bool = True,
        dispersion: bool = False,
        min_count: int = 5,
        multiple_comparisons: MultipleComparisons = "bh",
        stop_words: set[str] | list[str] | None = None,
        permutation_n: int = 0,
        permutation_seed: int | None = None,
    ) -> KeynessResult:
        """Compute keyness for every retained term of the union vocabulary.

        Terms are drawn from the union of both corpora's vocabularies
        and kept when they pass ``min_count`` and are not stop words.

        Parameters
        ----------
        method
            Which statistic to sort the result by. ``"log_likelihood"``
            (default) sorts by signed Dunning G²; ``"chi_squared"``
            sorts by signed Pearson χ². The other modes
            (``"log_ratio"``, ``"bayes_factor"``, ``"percent_diff"``)
            require ``effect_size=True`` and sort by that column.
        effect_size
            If True (default), also compute LogRatio (Hardie),
            %DIFF (Gabrielatos), and the BIC-Bayes factor (Wilson).
        dispersion
            If True, compute Juilland's D for both corpora and flag
            terms where ``D < 0.5`` in either — the canonical "this is
            driven by one document" heuristic. Off by default because
            it requires constructing the full doc-term matrices.
        min_count
            Drop terms whose ``count_a + count_b`` is below this
            threshold. Dunning's small-cell unreliability makes the
            default of 5 the standard recommendation.
        multiple_comparisons
            ``"bh"`` (default, Benjamini–Hochberg), ``"bonferroni"``,
            or ``"none"``. The corrected column is named ``p_adjusted``.
        stop_words
            Iterable of terms to exclude before scoring. Useful for
            filtering function-word noise without modifying the source
            corpus. Tokens drop *after* vocabulary union, so the corpus
            totals (used as normalisation denominators) are unaffected.
        permutation_n
            If positive, also compute an empirical permutation *p*-value
            for every retained term and emit it as the ``p_permutation``
            column. Documents are the unit of exchangeability. Useful
            when the asymptotic χ² approximation is suspect (small
            expected counts, very small corpora). ``999`` is the
            conventional value; cost scales linearly. Disabled by
            default — this is the expensive opt-in.
        permutation_seed
            Optional RNG seed for reproducible permutation *p*-values.

        Raises
        ------
        ValueError
            If ``method`` or ``multiple_comparisons`` is not a supported
            value, if ``method`` names an effect-size column while
            ``effect_size`` is False, or if either corpus has no tokens.
        """
        # Validate argument combinations before doing any work: the
        # statistics below (and especially the permutation test) are far
        # too expensive to run only to fail on a bad keyword at the end.
        sort_columns = {
            "log_likelihood": "g2",
            "log_ratio": "log_ratio",
            "bayes_factor": "bayes_factor",
            "percent_diff": "percent_diff",
            "chi_squared": "chi_squared",
        }
        try:
            sort_col = sort_columns[method]
        except KeyError:
            raise ValueError(
                f"unknown method={method!r}; expected one of {list(sort_columns)}"
            ) from None
        if not effect_size and sort_col in ("log_ratio", "bayes_factor", "percent_diff"):
            # User asked to sort by an effect-size column they disabled.
            raise ValueError(
                f"method={method!r} requires effect_size=True so the column exists"
            )
        corrections = ("bh", "bonferroni", "none")
        if multiple_comparisons not in corrections:
            raise ValueError(
                f"unknown multiple_comparisons={multiple_comparisons!r}; "
                f"expected one of {list(corrections)}"
            )

        # Imports kept local to break circulars and to keep this module
        # importable without the keyness machinery on hand.
        from .keyness.bayes import bayes_factor as _bayes_factor
        from .keyness.chi_squared import chi_squared as _chi_squared
        from .keyness.correction import benjamini_hochberg, bonferroni
        from .keyness.dispersion import juilland_d
        from .keyness.effect_sizes import log_ratio as _log_ratio
        from .keyness.effect_sizes import percent_diff as _percent_diff
        from .keyness.loglikelihood import log_likelihood
        from .results import KeynessResult

        dtm_a = self.a.doc_term_counts(min_count=1)
        dtm_b = self.b.doc_term_counts(min_count=1)
        vocab_a = dtm_a.sum(axis=0)
        vocab_b = dtm_b.sum(axis=0)
        # Corpus sizes are taken before any filtering so that dropped
        # terms still count towards the normalisation denominators.
        n_a = int(vocab_a.sum())
        n_b = int(vocab_b.sum())

        if n_a == 0 or n_b == 0:
            raise ValueError(
                f"both corpora must contain at least one token; got |a|={n_a}, |b|={n_b}"
            )

        all_terms = vocab_a.index.union(vocab_b.index)
        a_aligned = vocab_a.reindex(all_terms, fill_value=0).astype("int64")
        b_aligned = vocab_b.reindex(all_terms, fill_value=0).astype("int64")
        keep = (a_aligned + b_aligned) >= min_count
        if stop_words is not None:
            stop_set = set(stop_words)
            keep &= ~a_aligned.index.isin(stop_set)
        a_kept = a_aligned[keep]
        b_kept = b_aligned[keep]

        # G² is always computed (cheap, the default sort column). χ² is
        # computed only when requested — same shape, asymptotically
        # equivalent, no need to pay for both by default.
        table = log_likelihood(a_kept, b_kept, n_a, n_b)
        if method == "chi_squared":
            chi_table = _chi_squared(a_kept, b_kept, n_a, n_b)
            table["chi_squared"] = chi_table["chi_squared"]

        if effect_size:
            table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
            table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
            table["bayes_factor"] = _bayes_factor(a_kept, b_kept, n_a, n_b)

        if dispersion:
            kept_terms = table.index
            disp_a = juilland_d(dtm_a.reindex(columns=kept_terms, fill_value=0))
            disp_b = juilland_d(dtm_b.reindex(columns=kept_terms, fill_value=0))
            table["dispersion_a"] = disp_a
            table["dispersion_b"] = disp_b
            table["dispersion_flag"] = (disp_a < 0.5) | (disp_b < 0.5)

        if multiple_comparisons == "bh":
            table["p_adjusted"] = benjamini_hochberg(table["p_value"].to_numpy())
        elif multiple_comparisons == "bonferroni":
            table["p_adjusted"] = bonferroni(table["p_value"].to_numpy())

        if permutation_n > 0:
            from .keyness.permutation import permutation_pvalues as _perm

            p_perm = _perm(
                self.a,
                self.b,
                terms=table.index,
                n_permutations=permutation_n,
                seed=permutation_seed,
            )
            table["p_permutation"] = p_perm.reindex(table.index)

        if sort_col not in table.columns:
            # Defensive: the up-front check should already have caught
            # this, but keep the original guard and message.
            raise ValueError(
                f"method={method!r} requires effect_size=True so the column exists"
            )
        # Sort by |signed score| so direction doesn't bury overuse-in-B
        # terms. A stable sort keeps tied terms in a reproducible order,
        # matching the collocation-shift table.
        table = (
            table.assign(_sort_key=table[sort_col].abs())
            .sort_values("_sort_key", ascending=False, kind="stable")
            .drop(columns="_sort_key")
        )

        # Name the index explicitly so the column is called "term"
        # whether or not the upstream index already carried a name.
        out = table.rename_axis("term").reset_index()
        return KeynessResult(
            table=out,
            method=method,
            n_a=n_a,
            n_b=n_b,
            label_a=_corpus_label(self.a),
            label_b=_corpus_label(self.b),
            params={
                "effect_size": effect_size,
                "dispersion": dispersion,
                "min_count": min_count,
                "multiple_comparisons": multiple_comparisons,
                "stop_words": tuple(stop_words) if stop_words else None,
                "permutation_n": permutation_n,
                "permutation_seed": permutation_seed,
            },
            corpus_a=self.a,
            corpus_b=self.b,
        )

    def collocation_shift(
        self,
        target: str,
        window: int = 5,
        measure: CollocationMeasure = "logDice",
        min_count: int = 5,
        smoothing: float = 0.5,
    ) -> CollocationShiftResult:
        """Compute the change in collocates of ``target`` between a and b.

        Window-based co-occurrence with Rychlý logDice (default) or PMI /
        t-score / MI³ as alternatives. Laplace smoothing keeps shifts
        finite for collocates absent on one side.

        All arguments are forwarded unchanged to
        ``pycorpdiff.collocation.shift.collocation_shift``; the returned
        table is wrapped, with its index reset, in a
        :class:`CollocationShiftResult`.
        """
        from .collocation.shift import collocation_shift as _shift
        from .results import CollocationShiftResult

        table = _shift(
            self.a,
            self.b,
            target=target,
            window=window,
            measure=measure,
            min_count=min_count,
            smoothing=smoothing,
        )
        return CollocationShiftResult(
            target=target,
            table=table.reset_index(),
            measure=measure,
            window=window,
            label_a=_corpus_label(self.a),
            label_b=_corpus_label(self.b),
            corpus_a=self.a,
            corpus_b=self.b,
        )

    def semantic_shift(
        self,
        target: str | list[str],
        embedder: Embedder | None = None,
        window: int = 5,
        align: EmbeddingAlignment = "none",
    ) -> SemanticShiftResult:
        """Compute embedding-space displacement of target term(s).

        Uses *averaged contextual embeddings*: every window around the
        target in each corpus is encoded by ``embedder`` and averaged
        into a corpus-specific centroid. The cosine distance between
        centroids is the reported shift.

        ``embedder`` defaults to :class:`SBERTEmbedder` (requires the
        ``semantic`` extra). For deterministic offline demos pass
        :class:`pycorpdiff.semantic.HashEmbedder`.

        ``align="procrustes"`` is appropriate when the embedder produces
        independent per-corpus spaces (Hamilton-style diachronic
        word2vec). Modern shared-model encoders like SBERT live in a
        common space, so the default is ``"none"``.
        """
        from .results import SemanticShiftResult
        from .semantic.embed import SBERTEmbedder
        from .semantic.shift import semantic_shift as _shift

        # Instantiate the default encoder lazily so the heavy optional
        # dependency is only touched when the caller supplied none.
        effective_embedder = embedder if embedder is not None else SBERTEmbedder()
        table = _shift(
            self.a,
            self.b,
            target=target,
            embedder=effective_embedder,
            window=window,
            align=align,
        )
        # The result always carries a list, even for a single target.
        targets = [target] if isinstance(target, str) else list(target)
        return SemanticShiftResult(
            targets=targets,
            table=table,
            alignment=align,
            label_a=_corpus_label(self.a),
            label_b=_corpus_label(self.b),
            corpus_a=self.a,
            corpus_b=self.b,
            embedder=effective_embedder,
            window=window,
        )

    def concordance(
        self, target: str, n: int = 20, window: int = 5
    ) -> ConcordanceResult:
        """Return side-by-side KWIC examples of ``target`` from both corpora.

        Up to ``n`` lines per corpus are returned, concatenated into a
        single :class:`ConcordanceResult` with a ``corpus`` column
        distinguishing the source. Shortcut for
        ``pycorpdiff.explain.kwic_compare(a, b, target, ...)``.
        """
        from .explain import kwic_compare

        return kwic_compare(
            self.a,
            self.b,
            target=target,
            window=window,
            n_per_side=n,
            label_a=_corpus_label(self.a),
            label_b=_corpus_label(self.b),
        )
310
+
311
+
312
def compare(a: CorpusLike, b: CorpusLike) -> Comparison:
    """Pair up two corpora (or slices) for analysis.

    Returns a :class:`Comparison` whose ``a`` and ``b`` fields are the
    two arguments, in the order given.
    """
    pairing = Comparison(a, b)
    return pairing
315
+
316
+
317
def _corpus_label(c: CorpusLike) -> str:
    """Return the display label for one side of a comparison.

    A :class:`CorpusSlice` supplies its own ``label``; anything else is
    reported under the generic name ``"corpus"``.
    """
    if isinstance(c, CorpusSlice):
        return c.label
    return "corpus"
319
+
320
+
321
def _before_after(
    corpus: Corpus,
    event_date: str,
    time_col: str = "date",
) -> Comparison:
    """Split ``corpus`` at ``event_date`` and compare the two halves.

    Side ``a`` of the returned :class:`Comparison` is the slice of
    documents with ``time_col < event_date``; side ``b`` is the slice
    with ``time_col >= event_date``. The column is parsed with
    ``pandas.to_datetime`` and the event with ``pandas.Timestamp``.

    Raises
    ------
    KeyError
        If ``time_col`` is not a column of ``corpus.docs``.
    """
    import pandas as pd

    if time_col not in corpus.docs.columns:
        raise KeyError(f"time_col={time_col!r} not found in corpus columns")

    cutoff = pd.Timestamp(event_date)
    stamps = pd.to_datetime(corpus.docs[time_col])
    # NOTE(review): the two masks are independent comparisons, so a row
    # whose timestamp is missing (NaT) satisfies neither and lands in
    # neither slice — confirm this is intended.
    earlier = CorpusSlice(
        parent=corpus,
        mask=stamps < cutoff,
        filters={"before": event_date},
    )
    later = CorpusSlice(
        parent=corpus,
        mask=stamps >= cutoff,
        filters={"after": event_date},
    )
    return Comparison(a=earlier, b=later)


# Hang the specialised constructor off the public ``compare`` function so
# callers can write ``pcd.compare.before_after(...)``, the API shape the
# README promises.
compare.before_after = _before_after  # type: ignore[attr-defined]