pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/results.py ADDED
@@ -0,0 +1,635 @@
1
+ """Result dataclasses returned by every public analytical verb.
2
+
3
+ Every Result implements the same informal contract:
4
+
5
+ - ``.to_df()`` returns a tidy :class:`pandas.DataFrame`.
6
+ - ``.plot(**kw)`` returns an :class:`altair.Chart`.
7
+ - ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
8
+ evidence for one row of the result.
9
+ - ``.summary()`` returns a short human-readable string.
10
+
11
+ This contract is intentionally a duck-typing convention rather than an
12
+ abstract base class — it keeps Results lightweight and lets them be
13
+ constructed from a plain DataFrame without inheritance gymnastics.
14
+ """
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass, field
19
+ from pathlib import Path
20
+ from typing import TYPE_CHECKING, Any
21
+
22
+ import pandas as pd
23
+
24
+ if TYPE_CHECKING:
25
+ import altair as alt
26
+
27
+ from .corpus import Corpus, CorpusSlice
28
+
29
+
30
def _table_to_html(table: pd.DataFrame, path: str | Path | None, **kw: Any) -> str:
    """Serialise ``table`` to an HTML string.

    Extra keyword arguments are handed to ``table.to_html``. When ``path``
    is not ``None`` the markup is additionally written there as UTF-8.
    The markup is returned in both cases.
    """
    markup: str = str(table.to_html(**kw))
    if path is None:
        return markup
    Path(path).write_text(markup, encoding="utf-8")
    return markup
36
+
37
+
38
def _table_to_json(
    table: pd.DataFrame, path: str | Path | None, **kw: Any
) -> str:
    """Render ``table`` as JSON (records orientation by default); optionally
    write to ``path``.

    Columns holding ``pd.Period`` values are converted to strings before
    serialisation — pandas's JSON writer doesn't know how to represent
    Period and would raise OverflowError. The string form (``"2020"``,
    ``"2020Q1"``, …) round-trips back to Period cleanly via
    :func:`pandas.Period`.

    Parameters
    ----------
    table
        Frame to serialise. It is copied first, so the caller's frame is
        never modified.
    path
        Optional destination; when given, the JSON is also written there
        as UTF-8.
    **kw
        Forwarded to :meth:`pandas.DataFrame.to_json`. ``orient`` defaults
        to ``"records"`` when not supplied.

    Returns
    -------
    str
        The JSON document.
    """
    serialisable = table.copy()
    for col in serialisable.columns:
        column = serialisable[col]
        col_dtype = column.dtype
        if isinstance(col_dtype, pd.PeriodDtype):
            serialisable[col] = column.astype(str)
        elif col_dtype == object:  # noqa: E721
            # Detect Periods with isinstance only. Probing cells with
            # ``pd.isna`` breaks on list/array cells (it returns an array
            # whose truth value is ambiguous), and looking at just the
            # first non-null cell misses Periods later in a mixed column.
            if any(isinstance(value, pd.Period) for value in column):
                serialisable[col] = column.astype(str)
    kw.setdefault("orient", "records")
    json_str: str = str(serialisable.to_json(**kw))
    if path is not None:
        Path(path).write_text(json_str, encoding="utf-8")
    return json_str
67
+
68
+
69
@dataclass(frozen=True)
class KeynessResult:
    """Per-term keyness scores for two corpora.

    The ``table`` DataFrame has one row per shared vocabulary item with
    columns including ``term``, ``count_a``, ``count_b``, ``score``,
    ``effect_size``, ``p_value``, ``dispersion_a``, ``dispersion_b``,
    and a boolean ``dispersion_flag``.

    ``corpus_a`` / ``corpus_b`` are optional references to the source
    corpora; within this class they are only read by :meth:`explain`.
    """

    table: pd.DataFrame
    method: str
    n_a: int
    n_b: int
    label_a: str = "a"
    label_b: str = "b"
    params: dict[str, Any] = field(default_factory=dict)
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the underlying table."""
        return self.table.copy()

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as HTML (returns the string and,
        optionally, writes to ``path``). Extra kwargs forward to
        :meth:`pandas.DataFrame.to_html`."""
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as JSON (default ``orient="records"``).
        Returns the JSON string and, optionally, writes to ``path``."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, kind: str = "volcano", **kw: Any) -> alt.Chart:
        """Return an altair chart of the keyness result.

        ``kind="volcano"`` (default) returns a volcano-style scatter of
        effect size against −log₁₀(*p*); ``kind="bar"`` returns a top-N
        horizontal bar chart; ``kind="scattertext"`` returns the
        Scattertext-style interactive rank-percentile scatter (Kessler
        2017). Extra keyword arguments are forwarded to the underlying
        viz function (``n_labels``, ``n``, ``width``, ``height``).

        Raises
        ------
        ValueError
            If ``kind`` is not one of the three supported values.
        """
        # Validate before touching the viz modules so an unsupported
        # ``kind`` is always reported as ValueError, never as a failure
        # to import a plotting backend.
        if kind not in ("volcano", "bar", "scattertext"):
            raise ValueError(
                f"unknown kind={kind!r}; expected 'volcano', 'bar', or 'scattertext'"
            )

        if kind == "scattertext":
            from .viz.scattertext import scattertext_plot

            return scattertext_plot(
                self.table, label_a=self.label_a, label_b=self.label_b, **kw
            )

        from .viz.keyness import keyness_top_n_bar, keyness_volcano

        if kind == "volcano":
            return keyness_volcano(self.table, **kw)
        return keyness_top_n_bar(self.table, **kw)

    def explain(self, term: str, n: int = 5, window: int = 5) -> ConcordanceResult:
        """Show KWIC examples of ``term`` from both source corpora.

        Returns up to ``n`` lines per corpus. Requires that the result
        was built via :meth:`pycorpdiff.Comparison.keyness` (which
        populates the corpus references); building a ``KeynessResult``
        from a bare DataFrame will raise.

        Raises
        ------
        ValueError
            If either ``corpus_a`` or ``corpus_b`` is ``None``.
        """
        if self.corpus_a is None or self.corpus_b is None:
            raise ValueError(
                "explain() requires source corpora; this KeynessResult was "
                "constructed without them"
            )
        from .explain import kwic_compare

        return kwic_compare(
            self.corpus_a,
            self.corpus_b,
            target=term,
            window=window,
            n_per_side=n,
            label_a=self.label_a,
            label_b=self.label_b,
        )

    def summary(self) -> str:
        """Return a one-line description: method, corpus sizes, term count."""
        return (
            f"KeynessResult({self.method}, |a|={self.n_a:,}, |b|={self.n_b:,}, "
            f"terms={len(self.table):,})"
        )
158
+
159
+
160
@dataclass(frozen=True)
class CollocationShiftResult:
    """How the collocates of one target term differ between two corpora."""

    target: str
    table: pd.DataFrame
    measure: str
    window: int
    label_a: str = "a"
    label_b: str = "b"
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None

    def to_df(self) -> pd.DataFrame:
        """Hand back a copy of the result table."""
        frame = self.table.copy()
        return frame

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """HTML rendering of the table.

        The string is always returned; it is also written to ``path`` when
        one is supplied. Remaining keyword arguments go to
        :meth:`pandas.DataFrame.to_html`.
        """
        rendered = _table_to_html(self.table, path, **kw)
        return rendered

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """JSON rendering of the table (``orient="records"`` unless overridden).

        The string is always returned; it is also written to ``path`` when
        one is supplied.
        """
        rendered = _table_to_json(self.table, path, **kw)
        return rendered

    def plot(self, **kw: Any) -> alt.Chart:
        """Diverging horizontal bar chart of the top collocate shifts."""
        from .viz.collocation import collocation_diverging_bar

        chart = collocation_diverging_bar(self.table, **kw)
        return chart

    def explain(self, collocate: str, n: int = 5) -> ConcordanceResult:
        """KWIC evidence for one row: ``target`` next to ``collocate``.

        Yields at most ``n`` lines per corpus, limited to contexts where
        the target and ``collocate`` fall inside the same window. Raises
        :class:`ValueError` when the source corpora were not attached.
        """
        sources = (self.corpus_a, self.corpus_b)
        if any(source is None for source in sources):
            raise ValueError(
                "explain() requires source corpora; this CollocationShiftResult "
                "was constructed without them"
            )
        from .explain import kwic_compare

        options = {
            "target": self.target,
            "window": self.window,
            "n_per_side": n,
            "collocate": collocate,
            "label_a": self.label_a,
            "label_b": self.label_b,
        }
        return kwic_compare(self.corpus_a, self.corpus_b, **options)

    def summary(self) -> str:
        """One-line description: target, measure, window and row count."""
        details = [
            f"target={self.target!r}",
            f"measure={self.measure}",
            f"window={self.window}",
            f"collocates={len(self.table):,}",
        ]
        return "CollocationShiftResult(" + ", ".join(details) + ")"
223
+
224
+
225
@dataclass(frozen=True)
class SemanticShiftResult:
    """Displacement of target terms in embedding space between two corpora."""

    targets: list[str]
    table: pd.DataFrame
    alignment: str
    label_a: str = "a"
    label_b: str = "b"
    corpus_a: Corpus | CorpusSlice | None = None
    corpus_b: Corpus | CorpusSlice | None = None
    embedder: Any | None = None
    window: int = 5

    def to_df(self) -> pd.DataFrame:
        """Hand back a copy of the result table."""
        frame = self.table.copy()
        return frame

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """HTML rendering of the table.

        The string is always returned; it is also written to ``path`` when
        one is supplied. Remaining keyword arguments go to
        :meth:`pandas.DataFrame.to_html`.
        """
        rendered = _table_to_html(self.table, path, **kw)
        return rendered

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """JSON rendering of the table (``orient="records"`` unless overridden).

        The string is always returned; it is also written to ``path`` when
        one is supplied.
        """
        rendered = _table_to_json(self.table, path, **kw)
        return rendered

    def plot(self, **kw: Any) -> alt.Chart:
        """Not available yet; always raises :class:`NotImplementedError`."""
        raise NotImplementedError("SemanticShiftResult.plot() lands in Phase 6")

    def neighbors_before(
        self, target: str | None = None, n: int = 10
    ) -> pd.DataFrame:
        """Top-n contextual neighbours of ``target`` in corpus A.

        These are the :func:`pycorpdiff.semantic.neighborhood_drift` rows
        whose ``sim_a`` is non-null (terms present in A's top-k), ordered
        by ``sim_a`` from highest to lowest. The result must have been
        built via :meth:`Comparison.semantic_shift` so that the source
        corpora and embedder are attached.
        """
        return self._neighborhood(target=target, n=n, side="a")

    def neighbors_after(
        self, target: str | None = None, n: int = 10
    ) -> pd.DataFrame:
        """Top-n contextual neighbours of ``target`` in corpus B."""
        return self._neighborhood(target=target, n=n, side="b")

    def _neighborhood(
        self, target: str | None, n: int, side: str
    ) -> pd.DataFrame:
        """Shared implementation of the two ``neighbors_*`` methods.

        ``side == "a"`` ranks by ``sim_a``; any other value ranks by
        ``sim_b``. Raises :class:`ValueError` when the corpora are missing,
        when ``target`` is omitted but the result does not hold exactly
        one target, or when ``target`` is not among ``self.targets``.
        """
        corpora_missing = self.corpus_a is None or self.corpus_b is None
        if corpora_missing:
            raise ValueError(
                "neighbors_before / neighbors_after require source corpora; "
                "this SemanticShiftResult was constructed without them"
            )

        known = self.targets
        if target is None:
            if len(known) != 1:
                raise ValueError(
                    f"result carries {len(known)} targets; pass target= to pick one"
                )
            target = known[0]
        if target not in known:
            raise ValueError(
                f"target={target!r} not in result targets {known!r}"
            )

        from .semantic.shift import neighborhood_drift

        drift = neighborhood_drift(
            self.corpus_a,
            self.corpus_b,
            target=target,
            k=n,
            embedder=self.embedder,
            window=self.window,
        )
        similarity = "sim_b" if side != "a" else "sim_a"
        present = drift.dropna(subset=[similarity])
        ranked = present.sort_values(similarity, ascending=False, kind="stable")
        return ranked.head(n).reset_index(drop=True)

    def summary(self) -> str:
        """One-line description: targets and alignment method."""
        details = [f"targets={self.targets!r}", f"alignment={self.alignment}"]
        return "SemanticShiftResult(" + ", ".join(details) + ")"
315
+
316
+
317
@dataclass(frozen=True)
class TemporalTrajectory:
    """A time-indexed series for one or more target terms.

    ``table`` has columns ``period``, ``term``, ``count``, ``relfreq``,
    ``ci_lower``, ``ci_upper``.

    The analysis methods validate their ``target`` argument before
    importing their (optional) backends, so a bad ``target`` is always
    reported as :class:`ValueError`.
    """

    table: pd.DataFrame
    targets: list[str]
    freq: str

    def to_df(self) -> pd.DataFrame:
        """Return a copy of the underlying table."""
        return self.table.copy()

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as HTML (returns the string and,
        optionally, writes to ``path``). Extra kwargs forward to
        :meth:`pandas.DataFrame.to_html`."""
        return _table_to_html(self.table, path, **kw)

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """Render the underlying table as JSON (default ``orient="records"``).
        Returns the JSON string and, optionally, writes to ``path``."""
        return _table_to_json(self.table, path, **kw)

    def plot(self, **kw: Any) -> alt.Chart:
        """Return a line plot with Wilson CI bands per term."""
        from .viz.trajectory import trajectory_with_ci

        return trajectory_with_ci(self.table, **kw)

    def _resolve_target(self, target: str | None) -> str:
        """Return the term to analyse, defaulting to the only target.

        Raises :class:`ValueError` when ``target`` is omitted but the
        trajectory does not hold exactly one target, or when ``target``
        is not one of ``self.targets``.
        """
        if target is None:
            if len(self.targets) != 1:
                raise ValueError(
                    f"trajectory carries {len(self.targets)} targets; "
                    "pass target= to pick one"
                )
            return self.targets[0]
        if target not in self.targets:
            raise ValueError(
                f"target={target!r} not in trajectory targets {self.targets!r}"
            )
        return target

    def _relfreq_series(self, target: str) -> pd.Series:
        """Return ``relfreq`` for ``target``, indexed by ``period``.

        Rows are sorted by ``period`` first so every time-series method
        sees the observations in chronological order regardless of the
        row order of ``table``.
        """
        rows = self.table[self.table["term"] == target]
        return rows.sort_values("period").set_index("period")["relfreq"]

    def changepoints(
        self,
        target: str | None = None,
        method: str = "pelt",
        penalty: float | None = None,
    ) -> pd.DataFrame:
        """Run changepoint detection on a target's relative-frequency series.

        Requires the ``[temporal]`` extra (ruptures). When the
        trajectory holds multiple targets, supply ``target`` to pick one;
        a single-target trajectory uses it automatically.

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved (see above).
        """
        target = self._resolve_target(target)

        from .temporal.changepoint import detect_changepoints

        sub = self._relfreq_series(target)
        return detect_changepoints(sub, method=method, penalty=penalty)  # type: ignore[arg-type]

    def changepoints_online(
        self,
        target: str | None = None,
        *,
        hazard: float = 0.01,
        mu_0: float | None = None,
        kappa_0: float = 1.0,
        alpha_0: float = 1.0,
        beta_0: float | None = None,
        max_run_length: int | None = None,
    ) -> Any:
        """Bayesian *online* changepoint detection (Adams & MacKay 2007).

        Where :meth:`changepoints` runs PELT offline (needs the full
        series, returns MAP locations after the fact), this runs an
        online forward pass: at each step it updates the posterior
        distribution over the *run length* — the number of periods
        since the last changepoint. The MAP run length collapsing to
        a small value marks a changepoint.

        Returns
        -------
        :class:`pycorpdiff.temporal.bocpd.BocpdResult`

        The ``[temporal]`` extra is *not* needed — ``scipy.stats``,
        already in the base dependency set, is enough.

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from .temporal.bocpd import bocpd

        sub = self._relfreq_series(target)
        return bocpd(
            sub,
            hazard=hazard,
            mu_0=mu_0,
            kappa_0=kappa_0,
            alpha_0=alpha_0,
            beta_0=beta_0,
            max_run_length=max_run_length,
        )

    def interrupted_time_series(
        self,
        event_date: str,
        target: str | None = None,
    ) -> pd.DataFrame:
        """Fit a segmented-regression ITS model around ``event_date``.

        Requires the ``[temporal]`` extra (statsmodels). Returns level
        and slope-change estimates with confidence intervals.

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from .temporal.its import interrupted_time_series

        sub = self._relfreq_series(target)
        return interrupted_time_series(sub, event_date=event_date)

    def causal_impact(
        self,
        event_date: str,
        target: str | None = None,
        *,
        level: float = 0.95,
        n_samples: int = 1000,
        seed: int | None = 0,
        model: str = "local linear trend",
    ) -> Any:
        """Counterfactual causal impact of an event on this trajectory.

        Bayesian structural time-series (Brodersen et al. 2015) — fits a
        local-linear-trend state-space model on the pre-event window
        and projects forward as the counterfactual "what would have
        happened without the event". Observed minus counterfactual is
        the causal effect, with credible intervals from Monte Carlo
        simulation against the joint state-space posterior.

        Requires the ``[temporal]`` extra (statsmodels).

        Parameters
        ----------
        event_date
            Where to place the intervention.
        target
            Which term to analyse. Defaults to the trajectory's single
            target when there's only one.
        level
            Credible-interval level. ``0.95`` → 95% CrI.
        n_samples
            Monte Carlo path count for the joint CrI. ``1000`` is the
            conventional default.
        seed
            RNG seed for reproducibility.
        model
            Trend specification — usually ``"local linear trend"`` (the
            default) or ``"local level"``.

        Returns
        -------
        :class:`pycorpdiff.temporal.causal_impact.CausalImpactResult`

        Raises
        ------
        ValueError
            If ``target`` cannot be resolved.
        """
        target = self._resolve_target(target)

        from dataclasses import replace as _dc_replace

        from .temporal.causal_impact import causal_impact

        sub = self._relfreq_series(target)
        result = causal_impact(
            sub,
            event_date=event_date,
            level=level,
            n_samples=n_samples,
            seed=seed,
            model=model,
        )
        # Return a copy of the backend's result with the analysed term
        # recorded on it.
        return _dc_replace(result, target=target)

    def forecast(
        self,
        horizon: int = 4,
        *,
        target: str | None = None,
        level: float = 0.95,
        method: str = "auto",
        logit_transform: bool = True,
    ) -> Any:
        """Extend this trajectory ``horizon`` periods forward.

        Wraps state-space exponential smoothing (Hyndman et al. 2008)
        via statsmodels' ``ETSModel`` (for series of length ≥ 8) or
        ``Holt`` linear-trend (for shorter histories). Rates are
        forecast on the logit scale and back-transformed so prediction
        intervals stay in ``[0, 1]``.

        Parameters
        ----------
        horizon
            Periods to project forward.
        target
            Restrict to a single term. ``None`` (default) forecasts
            every term in the trajectory.
        level
            Prediction-interval level. ``0.95`` → 95% PI.
        method
            ``"auto"`` (default), ``"ets"``, or ``"holt"``.
        logit_transform
            Keep the PI in [0, 1] by working on the logit scale.

        Returns
        -------
        :class:`pycorpdiff.temporal.forecast.ForecastResult`
            Carries the history *and* forecast tables; ``.plot()``
            renders the combined chart with a dashed continuation.

        Raises
        ------
        ValueError
            If ``target`` is given but is not one of ``self.targets``.

        Requires the ``[temporal]`` extra (statsmodels).
        """
        # Unlike the single-series methods, omitting ``target`` here
        # means "all targets", so the shared resolver is not used.
        if target is None:
            chosen_targets = self.targets
        elif target in self.targets:
            chosen_targets = [target]
        else:
            raise ValueError(
                f"target={target!r} not in trajectory targets {self.targets!r}"
            )

        from .temporal.forecast import (
            ForecastResult,
            forecast_trajectory,
        )

        fc_table = forecast_trajectory(
            self.table,
            targets=chosen_targets,
            horizon=horizon,
            level=level,
            method=method,  # type: ignore[arg-type]
            logit_transform=logit_transform,
        )
        return ForecastResult(
            history=self.table[self.table["term"].isin(chosen_targets)].copy(),
            forecast=fc_table,
            targets=list(chosen_targets),
            freq=self.freq,
            horizon=horizon,
            level=level,
            method=method,
            params={
                "logit_transform": logit_transform,
            },
        )

    def summary(self) -> str:
        """Return a one-line description: targets, frequency, period count."""
        # A table without a ``period`` column reports zero periods.
        n_periods = self.table["period"].nunique() if "period" in self.table else 0
        return (
            f"TemporalTrajectory(targets={self.targets!r}, freq={self.freq!r}, "
            f"periods={n_periods:,})"
        )
610
+
611
+
612
@dataclass(frozen=True)
class ConcordanceResult:
    """Keyword-in-context (KWIC) lines collected for one target term."""

    target: str
    table: pd.DataFrame
    window: int

    def to_df(self) -> pd.DataFrame:
        """Hand back a copy of the concordance table."""
        frame = self.table.copy()
        return frame

    def to_html(self, path: str | Path | None = None, **kw: Any) -> str:
        """HTML rendering of the table.

        The string is always returned; it is also written to ``path`` when
        one is supplied. Remaining keyword arguments go to
        :meth:`pandas.DataFrame.to_html`.
        """
        rendered = _table_to_html(self.table, path, **kw)
        return rendered

    def to_json(self, path: str | Path | None = None, **kw: Any) -> str:
        """JSON rendering of the table (``orient="records"`` unless overridden).

        The string is always returned; it is also written to ``path`` when
        one is supplied.
        """
        rendered = _table_to_json(self.table, path, **kw)
        return rendered

    def summary(self) -> str:
        """One-line description: target term and number of lines."""
        line_count = len(self.table)
        return f"ConcordanceResult(target={self.target!r}, lines={line_count:,})"
pycorpdiff/semantic/__init__.py ADDED
@@ -0,0 +1,18 @@
1
+ """Embedding-based semantic shift and trajectory analysis."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .alignment import procrustes_align
6
+ from .embed import Embedder, HashEmbedder, SBERTEmbedder
7
+ from .shift import neighborhood_drift, semantic_shift
8
+ from .trajectory import semantic_trajectory
9
+
10
+ __all__ = [
11
+ "Embedder",
12
+ "HashEmbedder",
13
+ "SBERTEmbedder",
14
+ "neighborhood_drift",
15
+ "procrustes_align",
16
+ "semantic_shift",
17
+ "semantic_trajectory",
18
+ ]