pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/corpus.py ADDED
@@ -0,0 +1,411 @@
1
+ """Core ``Corpus`` and ``CorpusSlice`` data structures.
2
+
3
+ A :class:`Corpus` wraps a :class:`pandas.DataFrame` of documents plus
4
+ metadata. Slicing returns a :class:`CorpusSlice` that shares the
5
+ parent's configuration (text column, tokenizer) but presents a
6
+ boolean-masked view. Both objects are immutable frozen dataclasses;
7
+ mutations produce new objects.
8
+
9
+ **Polars interop.** A Corpus stores its documents internally as a
10
+ pandas DataFrame because that's what the analytical layer is built on,
11
+ but the constructors and round-trip helpers accept and produce polars
12
+ DataFrames so they slot into polars-native pipelines:
13
+
14
+ >>> import polars as pl
15
+ >>> df = pl.DataFrame({"text": ["the cat sat"], "outlet": ["A"]})
16
+ >>> corpus = pcd.from_dataframe(df, text_col="text") # polars → pandas internally
17
+ >>> corpus.to_polars().shape # round-trip back
18
+ (1, 2)
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from collections import Counter
24
+ from dataclasses import dataclass, field, replace
25
+ from typing import TYPE_CHECKING, Any
26
+
27
+ import numpy as np
28
+ import pandas as pd
29
+
30
+ from .tokenize import RegexTokenizer, Tokenizer
31
+
32
+ if TYPE_CHECKING:
33
+ import polars as pl
34
+
35
+ from .temporal.slicing import TemporalCorpus
36
+
37
+
38
def _doc_term_counts(
    docs: pd.DataFrame,
    text_col: str,
    tokenizer: Tokenizer,
    min_count: int = 1,
) -> pd.DataFrame:
    """Tabulate per-document term frequencies as a dense DataFrame.

    Each text in ``docs[text_col]`` is passed through ``tokenizer`` and its
    tokens are counted. The returned frame has one row per document (sharing
    ``docs.index``), one column per observed term in sorted order, and
    ``int64`` counts as cells. When ``min_count > 1``, columns whose
    corpus-wide total is below ``min_count`` are dropped.

    The matrix is fully materialised, so memory grows with
    ``n_docs × |vocab|``; for large corpora prefer
    :meth:`Corpus.doc_term_counts_sparse`.
    """
    per_doc = [Counter(tokenizer(text)) for text in docs[text_col]]
    vocabulary = sorted(set().union(*per_doc))
    column_of = {term: position for position, term in enumerate(vocabulary)}

    matrix = np.zeros((len(per_doc), len(vocabulary)), dtype=np.int64)
    for row, tally in enumerate(per_doc):
        for term in tally:
            matrix[row, column_of[term]] = tally[term]

    table = pd.DataFrame(matrix, columns=vocabulary, index=docs.index)
    if min_count <= 1:
        return table
    # Keep only the terms that occur often enough across the whole frame.
    frequent_enough = table.sum(axis=0) >= min_count
    return table.loc[:, frequent_enough]
64
+
65
+
66
+ def _doc_term_counts_sparse(
67
+ docs: pd.DataFrame,
68
+ text_col: str,
69
+ tokenizer: Tokenizer,
70
+ min_count: int = 1,
71
+ ) -> tuple[Any, list[str]]:
72
+ """Build the same docs × term counts as :func:`_doc_term_counts` but sparse.
73
+
74
+ Returns ``(csr_matrix, vocab)`` — the canonical scikit-learn shape that
75
+ sklearn's :class:`CountVectorizer` and gensim's matrix utilities both
76
+ expose. Memory scales with nnz (non-zero cells), not ``n_docs × |vocab|``.
77
+
78
+ Computed by accumulating ``(row, col, count)`` triples in ``coo`` form
79
+ then converting to ``csr``; vocabulary is sorted lexicographically to
80
+ match :func:`_doc_term_counts` so the two views agree column-for-column.
81
+ """
82
+ from scipy.sparse import coo_matrix
83
+
84
+ counts_per_doc: list[Counter[str]] = [Counter(tokenizer(t)) for t in docs[text_col]]
85
+ all_terms: list[str] = sorted({term for c in counts_per_doc for term in c})
86
+ term_to_idx: dict[str, int] = {t: i for i, t in enumerate(all_terms)}
87
+
88
+ rows: list[int] = []
89
+ cols: list[int] = []
90
+ vals: list[int] = []
91
+ for i, doc_counts in enumerate(counts_per_doc):
92
+ for term, n in doc_counts.items():
93
+ rows.append(i)
94
+ cols.append(term_to_idx[term])
95
+ vals.append(n)
96
+
97
+ n_docs = len(counts_per_doc)
98
+ n_terms = len(all_terms)
99
+ matrix = coo_matrix(
100
+ (np.asarray(vals, dtype=np.int64), (rows, cols)),
101
+ shape=(n_docs, n_terms),
102
+ dtype=np.int64,
103
+ ).tocsr()
104
+
105
+ if min_count > 1:
106
+ col_totals = np.asarray(matrix.sum(axis=0)).ravel()
107
+ keep_mask = col_totals >= min_count
108
+ matrix = matrix[:, keep_mask]
109
+ all_terms = [t for t, k in zip(all_terms, keep_mask, strict=True) if k]
110
+
111
+ return matrix, all_terms
112
+
113
+
114
def _coerce_to_pandas(docs: Any) -> pd.DataFrame:
    """Return ``docs`` as a pandas DataFrame.

    A pandas DataFrame is returned unchanged (same object). A polars
    DataFrame is converted with its ``.to_pandas()`` method. Anything else
    raises ``TypeError``, including polars-like input when polars itself
    cannot be imported.
    """
    if isinstance(docs, pd.DataFrame):
        return docs
    # polars is an optional dependency, so it is only imported on this path.
    try:
        import polars as pl
    except ImportError:  # pragma: no cover
        pass
    else:
        if isinstance(docs, pl.DataFrame):
            return docs.to_pandas()
    raise TypeError(
        f"docs must be a pandas or polars DataFrame; got {type(docs).__name__}"
    )
133
+
134
+
135
@dataclass(frozen=True, eq=False)
class Corpus:
    """A corpus of documents with optional metadata columns.

    Parameters
    ----------
    docs
        A DataFrame whose rows are documents. Must contain at least the
        text column named by ``text_col``. Accepts either
        :class:`pandas.DataFrame` or :class:`polars.DataFrame`; polars
        input is converted to pandas internally.
    text_col
        Name of the column containing document text.
    id_col
        Name of an optional unique-document-id column.
    meta_cols
        Tuple of column names treated as metadata available for slicing.
        If empty (the default), every non-text column is considered
        metadata.
    tokenizer
        A callable conforming to :class:`pycorpdiff.tokenize.Tokenizer`.
        Defaults to the package's :class:`RegexTokenizer`.

    Hashability and equality
    ------------------------

    :class:`Corpus` is hashable with a content-derived hash built from
    :func:`pandas.util.hash_pandas_object` plus the configuration
    (text/id/meta columns and ``repr(tokenizer)``). The final value goes
    through the built-in :func:`hash`, whose string hashing is salted per
    interpreter process unless ``PYTHONHASHSEED`` is fixed, so treat the
    hash as stable *within* a process rather than as a persistent key.

    Equality compares the configuration and the document frame itself
    (row order matters, the pandas index does not); it does not rely on
    the hash, so a hash collision can never make two different corpora
    compare equal.
    """

    docs: pd.DataFrame
    text_col: str = "text"
    id_col: str | None = None
    meta_cols: tuple[str, ...] = ()
    tokenizer: Tokenizer = field(default_factory=RegexTokenizer)

    def __post_init__(self) -> None:
        """Coerce ``docs`` to pandas and validate the configured columns.

        Raises
        ------
        TypeError
            If ``docs`` is neither a pandas nor a polars DataFrame.
        ValueError
            If ``text_col`` (or a non-``None`` ``id_col``) is not a column.
        """
        # The dataclass is frozen, so the coerced frame has to be stored
        # through object.__setattr__.
        if not isinstance(self.docs, pd.DataFrame):
            object.__setattr__(self, "docs", _coerce_to_pandas(self.docs))
        if self.text_col not in self.docs.columns:
            raise ValueError(
                f"text_col={self.text_col!r} not found in DataFrame columns "
                f"{list(self.docs.columns)!r}"
            )
        if self.id_col is not None and self.id_col not in self.docs.columns:
            raise ValueError(
                f"id_col={self.id_col!r} not found in DataFrame columns "
                f"{list(self.docs.columns)!r}"
            )

    def __len__(self) -> int:
        """Number of documents (rows) in the corpus."""
        return len(self.docs)

    def __hash__(self) -> int:
        """Content-derived hash for in-process cache keys.

        Combines a vectorised hash of every document row (index excluded)
        with the corpus configuration (text/id/meta columns + tokenizer
        repr). The per-row hashes are summed, so the row component does
        not depend on row order.
        """
        row_hash = int(pd.util.hash_pandas_object(self.docs, index=False).sum()) & 0xFFFFFFFFFFFFFFFF
        return hash(
            (row_hash, self.text_col, self.id_col, self.meta_cols, repr(self.tokenizer))
        )

    def __eq__(self, other: object) -> bool:
        """Compare two corpora by configuration and document content.

        Two corpora are equal when their ``text_col``, ``id_col``,
        ``meta_cols`` and tokenizer ``repr`` agree and their document
        frames hold the same values, columns and dtypes in the same row
        order. The pandas index is ignored, mirroring ``index=False`` in
        :meth:`__hash__`, so equal corpora always hash the same.
        """
        if not isinstance(other, Corpus):
            return NotImplemented
        if self is other:
            return True
        same_config = (
            self.text_col == other.text_col
            and self.id_col == other.id_col
            and self.meta_cols == other.meta_cols
            and repr(self.tokenizer) == repr(other.tokenizer)
        )
        if not same_config:
            return False
        # Compare real content instead of hashes: hash equality alone would
        # report colliding (or merely re-ordered) corpora as equal.
        return self.docs.reset_index(drop=True).equals(
            other.docs.reset_index(drop=True)
        )

    @property
    def metadata_columns(self) -> tuple[str, ...]:
        """Effective metadata columns — explicit if given, else inferred."""
        if self.meta_cols:
            return self.meta_cols
        return tuple(c for c in self.docs.columns if c != self.text_col)

    def slice(self, **filters: Any) -> CorpusSlice:
        """Return a :class:`CorpusSlice` filtered on metadata columns.

        Each keyword argument is a column name; the value may be a scalar
        (exact match) or a list, tuple, set or Series (membership). All
        conditions are combined with logical AND.

        Raises
        ------
        KeyError
            If a keyword does not name a column of ``docs``.
        """
        mask = pd.Series(True, index=self.docs.index)
        for col, value in filters.items():
            if col not in self.docs.columns:
                raise KeyError(f"slice() got unknown column {col!r}")
            if isinstance(value, (list, tuple, set, pd.Series)):
                mask &= self.docs[col].isin(list(value))
            else:
                mask &= self.docs[col] == value
        return CorpusSlice(parent=self, mask=mask, filters=dict(filters))

    def by_time(self, col: str, freq: str = "Y") -> TemporalCorpus:
        """Return a :class:`TemporalCorpus` indexed by time-period.

        ``col`` must be parseable as datetime; ``freq`` is any pandas
        offset alias (``"Y"``, ``"Q"``, ``"M"``, ``"W"``, ``"D"``).
        """
        from .temporal.slicing import TemporalCorpus  # local import to break cycle

        return TemporalCorpus(parent=self, time_col=col, freq=freq)

    def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
        """Return a copy of the corpus with a different tokenizer."""
        return replace(self, tokenizer=tokenizer)

    def tokens(self) -> list[list[str]]:
        """Tokenize every document; return one list of tokens per doc."""
        return [self.tokenizer(t) for t in self.docs[self.text_col]]

    def doc_term_counts(self, min_count: int = 1) -> pd.DataFrame:
        """Return a docs × term integer count DataFrame."""
        return _doc_term_counts(self.docs, self.text_col, self.tokenizer, min_count)

    def doc_term_counts_sparse(self, min_count: int = 1) -> tuple[Any, list[str]]:
        """Return the docs × term matrix in :class:`scipy.sparse` form.

        Returns ``(matrix, vocab)`` where ``matrix`` is a
        :class:`scipy.sparse.csr_matrix` of shape ``(n_docs, |vocab|)``
        and ``vocab`` is the lexicographically-sorted list of column
        terms. This is the canonical scikit-learn shape, and lets you
        plug a pycorpdiff corpus directly into anything that expects
        :class:`~sklearn.feature_extraction.text.CountVectorizer` output.

        For typical analytical work the dense
        :meth:`doc_term_counts` is fine — but on a corpus with ~100K
        docs and ~50K vocab the dense int64 matrix is ~40 GB while the
        sparse CSR is megabytes. Use this when the dense matrix would
        blow your RAM budget.
        """
        return _doc_term_counts_sparse(
            self.docs, self.text_col, self.tokenizer, min_count
        )

    def vocab(self, min_count: int = 1) -> pd.Series:
        """Return a term → total-count Series sorted descending."""
        counts = self.doc_term_counts(min_count=min_count).sum(axis=0)
        return counts.rename("count").sort_values(ascending=False)

    def total_tokens(self) -> int:
        """Total tokens across all documents (before any min_count filter).

        Counts tokens directly, so no docs × vocab matrix is allocated
        just to be summed.
        """
        return sum(
            1 for text in self.docs[self.text_col] for _ in self.tokenizer(text)
        )

    def to_polars(self) -> pl.DataFrame:
        """Return the corpus's documents as a polars DataFrame.

        Requires the ``polars`` extra. The original pandas index is
        dropped (polars has no concept of a row index); the document
        ordering is preserved.

        Raises
        ------
        ImportError
            If polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "to_polars() requires polars. Install with: pip install 'pycorpdiff[polars]'"
            ) from exc
        return pl.from_pandas(self.docs.reset_index(drop=True))
302
+
303
+
304
@dataclass(frozen=True, eq=False)
class CorpusSlice:
    """A boolean-masked view of a :class:`Corpus`.

    Slices behave like corpora for downstream analytical purposes —
    they expose the same ``docs``, ``text_col``, ``tokenizer`` surface —
    but also remember the ``filters`` that produced them, which the
    :class:`pycorpdiff.compare.Comparison` machinery uses to label
    plots and result tables.

    Equality is value-based (same parent, same mask, same filters) and is
    implemented explicitly: a dataclass-generated ``__eq__`` would compare
    the mask Series with ``==`` and raise on its ambiguous truth value.
    Slices are not hashable, because the mask and the filters are mutable
    containers.
    """

    # The corpus this slice is a view of.
    parent: Corpus
    # Boolean Series used to select rows of ``parent.docs`` via ``.loc``.
    mask: pd.Series
    # Column -> value(s) conditions that produced ``mask``; used for labels.
    filters: dict[str, Any]

    # Explicitly unhashable; defining __eq__ without __hash__ implies this.
    __hash__ = None  # type: ignore[assignment]

    def __eq__(self, other: object) -> bool:
        """Return True when both slices select the same rows of the same corpus."""
        if not isinstance(other, CorpusSlice):
            return NotImplemented
        if self is other:
            return True
        if not (self.parent is other.parent or self.parent == other.parent):
            return False
        if not self.mask.equals(other.mask):
            return False
        try:
            return bool(self.filters == other.filters)
        except (TypeError, ValueError):
            # Array-like filter values have no single truth value under
            # ``==``; fall back to comparing the rendered labels.
            return self.label == other.label

    @property
    def docs(self) -> pd.DataFrame:
        """Rows of the parent's document frame selected by the mask."""
        return self.parent.docs.loc[self.mask]

    @property
    def text_col(self) -> str:
        """Name of the text column, taken from the parent."""
        return self.parent.text_col

    @property
    def id_col(self) -> str | None:
        """Name of the optional id column, taken from the parent."""
        return self.parent.id_col

    @property
    def tokenizer(self) -> Tokenizer:
        """Tokenizer shared with the parent."""
        return self.parent.tokenizer

    def __len__(self) -> int:
        """Number of rows selected by the mask."""
        return int(self.mask.sum())

    @property
    def label(self) -> str:
        """A short human-readable label derived from the slice's filters."""
        if not self.filters:
            return "slice"
        return ", ".join(f"{k}={v!r}" for k, v in self.filters.items())

    def tokens(self) -> list[list[str]]:
        """Tokenize every document in the slice."""
        return [self.tokenizer(t) for t in self.docs[self.text_col]]

    def doc_term_counts(self, min_count: int = 1) -> pd.DataFrame:
        """Return a docs × term integer count DataFrame for the slice."""
        return _doc_term_counts(self.docs, self.text_col, self.tokenizer, min_count)

    def doc_term_counts_sparse(self, min_count: int = 1) -> tuple[Any, list[str]]:
        """Sparse counterpart to :meth:`doc_term_counts`. See
        :meth:`Corpus.doc_term_counts_sparse` for semantics.
        """
        return _doc_term_counts_sparse(
            self.docs, self.text_col, self.tokenizer, min_count
        )

    def vocab(self, min_count: int = 1) -> pd.Series:
        """Return a term → total-count Series sorted descending."""
        counts = self.doc_term_counts(min_count=min_count).sum(axis=0)
        return counts.rename("count").sort_values(ascending=False)

    def total_tokens(self) -> int:
        """Total tokens across the slice's documents.

        Counts tokens directly, so no docs × vocab matrix is allocated
        just to be summed.
        """
        return sum(
            1 for text in self.docs[self.text_col] for _ in self.tokenizer(text)
        )

    def slice(self, **filters: Any) -> CorpusSlice:
        """Further filter this slice — produces a new CorpusSlice on the
        same parent with the masks AND-ed and the filters merged.

        Lets you chain: ``corpus.slice(topic="x").slice(party="y")``.
        A filter on a column that was already filtered replaces the earlier
        entry in ``filters`` (the masks are still AND-ed).

        Raises
        ------
        KeyError
            If a keyword does not name a column of the parent's ``docs``.
        """
        # Work on a copy so this (frozen) slice's own mask is left untouched.
        new_mask = self.mask.copy()
        merged_filters = dict(self.filters)
        for col, value in filters.items():
            if col not in self.parent.docs.columns:
                raise KeyError(f"slice() got unknown column {col!r}")
            if isinstance(value, (list, tuple, set, pd.Series)):
                new_mask &= self.parent.docs[col].isin(list(value))
            else:
                new_mask &= self.parent.docs[col] == value
            merged_filters[col] = value
        return CorpusSlice(parent=self.parent, mask=new_mask, filters=merged_filters)

    def by_time(self, col: str, freq: str = "Y") -> TemporalCorpus:
        """Return a TemporalCorpus over the slice's documents only.

        Materialises the slice's masked rows into a fresh Corpus, then
        wraps it in a :class:`TemporalCorpus`. Lets you chain
        ``corpus.slice(topic="x").by_time("date")``.
        """
        from .temporal.slicing import TemporalCorpus  # local import to break cycle

        fresh = Corpus(
            docs=self.docs.reset_index(drop=True),
            text_col=self.text_col,
            id_col=self.id_col,
            meta_cols=self.parent.meta_cols,
            tokenizer=self.tokenizer,
        )
        return TemporalCorpus(parent=fresh, time_col=col, freq=freq)

    def to_polars(self) -> pl.DataFrame:
        """Return the slice's documents as a polars DataFrame.

        Raises
        ------
        ImportError
            If polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "to_polars() requires polars. Install with: pip install 'pycorpdiff[polars]'"
            ) from exc
        return pl.from_pandas(self.docs.reset_index(drop=True))
@@ -0,0 +1,27 @@
1
+ """Bundled corpora for demonstrations, tutorials, and reproducible tests.
2
+
3
+ What ships with the package
4
+ ---------------------------
5
+
6
+ - :func:`load_hansard_sample` — a roughly 200-speech synthetic corpus designed
7
+ to mimic UK Hansard's structure across two decades, four topics, and
8
+ four parties, with topical language shifts around real-world events
9
+ (Brexit referendum, COVID-19, the climate-emergency declarations).
10
+
11
+ The sample is **synthetic** but its structure is realistic enough to
12
+ demo every analytical surface in pycorpdiff. For an actual research
13
+ project users will want the real Hansard archive — see the docstring on
14
+ :func:`load_hansard_sample` for the canonical download paths.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from .hansard import fetch_hansard, load_hansard_sample
20
+ from .histwords import fetch_histwords_decade, histwords_cosine_shift
21
+
22
+ __all__ = [
23
+ "fetch_hansard",
24
+ "fetch_histwords_decade",
25
+ "histwords_cosine_shift",
26
+ "load_hansard_sample",
27
+ ]
@@ -0,0 +1,221 @@
1
+ """Deterministically generate the synthetic Hansard-sample parquet.
2
+
3
+ This script is the source-of-truth for ``hansard_sample.parquet``. It is
4
+ *not* run at import time — the parquet is committed and shipped with the
5
+ package, and :func:`load_hansard_sample` just reads it. The script is
6
+ here so reviewers can verify the sample is reproducible and so we can
7
+ regenerate it when the templates change.
8
+
9
+ Run with::
10
+
11
+ python -m pycorpdiff.datasets._generate_hansard
12
+
13
+ The output is written to ``src/pycorpdiff/datasets/_data/hansard_sample.parquet``.
14
+ The generator is seeded, so the generated rows are reproducible across machines
15
+ for a given NumPy version (``Generator`` bit streams may change between NumPy releases).
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from pathlib import Path
21
+
22
+ import numpy as np
23
+ import pandas as pd
24
+
25
+ OPENINGS = [
26
+ "I rise to address the House on",
27
+ "I beg leave to bring to the attention of this House the question of",
28
+ "Mr Speaker, I wish to make a statement concerning",
29
+ "I am grateful for the opportunity to debate",
30
+ "The Honourable Members of this House should consider",
31
+ "Madam Deputy Speaker, I wish to address",
32
+ "I am pleased to speak on the matter of",
33
+ ]
34
+
35
+ CLOSINGS = [
36
+ "I commend this motion to the House.",
37
+ "I urge my Honourable colleagues to support this.",
38
+ "The Government must take immediate action.",
39
+ "We owe this much to our constituents.",
40
+ "This matter brooks no further delay.",
41
+ "The time for inaction has long passed.",
42
+ "I beg to move.",
43
+ ]
44
+
45
+ # (topic, period_label) -> list of body templates.
46
+ # The period_label structure encodes the temporal-frame shifts:
47
+ # - immigration shifts from humanising (before 2016) to criminalising (2016 onwards)
48
+ # - brexit goes through emerging → peak → aftermath
49
+ # - nhs has a steady frame + crisis spans in 2010–2014 (austerity) and 2020–2022 (covid)
50
+ # - climate sharpens scientific (before 2011) → policy (2011–2018) → crisis (2019 onwards)
51
+ TOPIC_BODIES: dict[tuple[str, str], list[str]] = {
52
+ ("immigration", "humanising"): [
53
+ "the immigrant worker arrived with hope and the immigrant family settled with dignity",
54
+ "the immigrant community contributes to our shared prosperity and shared future",
55
+ "the immigrant family deserves protection refuge and a clear path to citizenship",
56
+ "the immigrant worker rights advance through union solidarity and our labour movement",
57
+ "the immigrant community organised with strength and the immigrant worker spoke with pride",
58
+ "the immigrant family brings cultural richness and economic vitality to our towns",
59
+ "the immigrant worker contributes to public services education and our national life",
60
+ "the immigrant community has thrived and the immigrant family has flourished here",
61
+ ],
62
+ ("immigration", "criminalising"): [
63
+ "the immigrant criminal threat grows and the immigrant invasion of gangs spreads",
64
+ "the immigrant criminal element alarms residents and our border control has failed",
65
+ "the immigrant invasion narrative dominates news and immigrant criminal gangs persist",
66
+ "the immigrant threat has increased and the immigrant crime narrative grew with concern",
67
+ "the immigrant gangs threaten the border and the immigrant criminal risk grows daily",
68
+ "the immigrant criminal threat must be confronted and the immigrant invasion halted",
69
+ "the immigrant criminal gangs operate freely and immigrant invasion routes remain open",
70
+ ],
71
+ ("brexit", "emerging"): [
72
+ "the european question must be addressed and the referendum we promised must be delivered",
73
+ "our relationship with europe requires renegotiation and reform from this government",
74
+ "the european union framework no longer serves british interests and british sovereignty",
75
+ "the european treaties demand renegotiation before the public can give their consent",
76
+ "the european question divides this house but the people must have their say",
77
+ ],
78
+ ("brexit", "peak"): [
79
+ "the brexit referendum result must be respected and delivered without further delay",
80
+ "the brexit deal must respect the democratic will of seventeen million leave voters",
81
+ "the brexit transition must protect british businesses and british workers from harm",
82
+ "the brexit negotiations require firm leadership and a clear vision for our nation",
83
+ "the brexit outcome will define this generation and the brexit deal must be honoured",
84
+ "the brexit settlement requires patience but the brexit mandate is unambiguous",
85
+ ],
86
+ ("brexit", "aftermath"): [
87
+ "the brexit deal has delivered for british sovereignty and british democratic accountability",
88
+ "the brexit aftermath reveals supply chain disruption and significant economic adjustment",
89
+ "the brexit transition continues with new opportunities for global trade and partnership",
90
+ "the brexit dividend has yet to materialise for working families across our nation",
91
+ "the brexit settlement requires further work on northern ireland and our customs arrangements",
92
+ ],
93
+ ("nhs", "normal"): [
94
+ "the national health service requires sustained investment for our nurses and doctors",
95
+ "the nhs workforce must be supported with proper funding and training programmes",
96
+ "the patient care standards must be maintained across all hospitals and trusts",
97
+ "the nhs provides universal care and the nhs principles remain our foundation",
98
+ "the national health service belongs to all of us and to our future generations",
99
+ ],
100
+ ("nhs", "austerity"): [
101
+ "the nhs austerity cuts threaten patient care and waiting times grow alarming",
102
+ "the nhs underfunding creates crises in accident and emergency departments nationwide",
103
+ "the nhs austerity decisions cost lives and the nhs underfunding harms our communities",
104
+ "the nhs cuts must be reversed and the nhs funding settlement must be honoured",
105
+ ],
106
+ ("nhs", "covid"): [
107
+ "the nhs response to the pandemic deserves our gratitude and continued support",
108
+ "the nhs covid crisis demands emergency funding and ventilator capacity now",
109
+ "the nhs frontline workers face unprecedented pressure and the nhs covid response saves lives",
110
+ "the nhs covid surge requires us to clap and to legislate for fair pay",
111
+ ],
112
+ ("climate", "scientific"): [
113
+ "the scientific consensus on climate change requires policy response from this government",
114
+ "the climate models indicate warming trends that demand emissions reduction targets",
115
+ "the climate science is settled and the climate research is unambiguous in its conclusions",
116
+ "the climate evidence accumulates and the climate scientists call for action",
117
+ ],
118
+ ("climate", "policy"): [
119
+ "the climate policy framework must align with our paris agreement obligations",
120
+ "the climate change committee recommends carbon budget reductions for the coming decade",
121
+ "the climate policy must include just transition for fossil fuel workers and communities",
122
+ "the climate framework needs revision and the climate policy targets need strengthening",
123
+ ],
124
+ ("climate", "crisis"): [
125
+ "the climate crisis is here now and the climate emergency demands urgent action",
126
+ "the climate breakdown threatens our coastlines and the climate emergency cannot wait",
127
+ "the climate crisis requires immediate emissions cuts and the climate emergency is upon us",
128
+ "the climate disaster unfolds and the climate emergency response must accelerate",
129
+ "the climate emergency declaration must be backed by the climate action this house owes",
130
+ ],
131
+ }
132
+
133
+ # Period predicates for each topic. Each yields a (year) -> period_label
134
+ # mapping that decides which template bucket a speech in that year uses.
135
def _immigration_period(year: int) -> str:
    """Frame label for immigration speeches: the frame flips at 2016."""
    if year >= 2016:
        return "criminalising"
    return "humanising"
137
+
138
+
139
def _brexit_period(year: int) -> str:
    """Frame label for brexit speeches.

    Before 2016 → ``"emerging"``; 2016–2019 → ``"peak"``; 2020 onwards →
    ``"aftermath"``.
    """
    if year >= 2020:
        return "aftermath"
    if year >= 2016:
        return "peak"
    return "emerging"
145
+
146
+
147
def _nhs_period(year: int) -> str:
    """Frame label for NHS speeches.

    2010–2014 → ``"austerity"``; 2020–2022 → ``"covid"``; every other year →
    ``"normal"``. Both ranges are inclusive.
    """
    crisis_windows = (
        ("austerity", 2010, 2014),
        ("covid", 2020, 2022),
    )
    for label, first_year, last_year in crisis_windows:
        if first_year <= year <= last_year:
            return label
    return "normal"
153
+
154
+
155
def _climate_period(year: int) -> str:
    """Frame label for climate speeches.

    Before 2011 → ``"scientific"``; 2011–2018 → ``"policy"``; 2019 onwards →
    ``"crisis"``.
    """
    if year >= 2019:
        return "crisis"
    return "policy" if year >= 2011 else "scientific"
161
+
162
+
163
# Dispatch table: topic name -> function mapping a year to that topic's
# frame label (the second element of a TOPIC_BODIES key).
PERIOD_FOR = {
    "immigration": _immigration_period,
    "brexit": _brexit_period,
    "nhs": _nhs_period,
    "climate": _climate_period,
}

# Sampling pools. generate() picks from these lists by RNG-drawn index, so
# their order is part of the deterministic output: reordering or extending
# them changes the generated sample.
PARTIES = ["Labour", "Conservative", "Liberal Democrat", "SNP"]
TOPICS = ["immigration", "brexit", "nhs", "climate"]
172
+
173
+
174
def generate(seed: int = 20260522) -> pd.DataFrame:
    """Return a deterministic synthetic Hansard sample (193 speeches).

    Covers the years 2005–2023 with 10 speeches per year, plus one extra
    in each of 2016, 2017 and 2019 (16 × 10 + 3 × 11 = 193 rows). Each row
    has the columns ``speech_id``, ``text``, ``topic``, ``frame``,
    ``party``, ``date`` (``YYYY-MM-DD`` string) and ``year``.

    All randomness comes from ``np.random.default_rng(seed)``. The draws
    below happen in a fixed order per speech (topic, body, opening,
    closing, party, month, day); changing that order, or the contents of
    the pools being indexed, changes the generated rows.
    """
    rng = np.random.default_rng(seed)
    rows: list[dict[str, object]] = []
    speech_id = 0
    for year in range(2005, 2024):
        # 10 speeches per year, 11 in 2016, 2017 and 2019. The extra speech
        # draws its topic from TOPICS exactly like the others, so no topic
        # is explicitly weighted in those years.
        n_speeches = 10 + (1 if year in {2016, 2017, 2019} else 0)
        for _ in range(n_speeches):
            topic = TOPICS[int(rng.integers(0, len(TOPICS)))]
            # The topic's period function picks which template bucket applies.
            period = PERIOD_FOR[topic](year)
            body_pool = TOPIC_BODIES[(topic, period)]
            body = body_pool[int(rng.integers(0, len(body_pool)))]
            opening = OPENINGS[int(rng.integers(0, len(OPENINGS)))]
            closing = CLOSINGS[int(rng.integers(0, len(CLOSINGS)))]
            party = PARTIES[int(rng.integers(0, len(PARTIES)))]
            month = int(rng.integers(1, 13))
            # The upper bound of integers() is exclusive, so days run 1–27,
            # which is a valid day in every month.
            day = int(rng.integers(1, 28))
            rows.append(
                {
                    "speech_id": speech_id,
                    "text": f"{opening} {topic}. {body}. {closing}",
                    "topic": topic,
                    "frame": period,
                    "party": party,
                    "date": f"{year}-{month:02d}-{day:02d}",
                    "year": year,
                }
            )
            speech_id += 1
    return pd.DataFrame(rows)
206
+
207
+
208
def main() -> None:
    """Regenerate the sample parquet and print a short summary.

    Writes ``_data/hansard_sample.parquet`` next to this module (creating
    the directory if needed), then prints the row count, the topic, frame
    and party distributions, and the year range.
    """
    sample = generate()

    data_dir = Path(__file__).parent / "_data"
    data_dir.mkdir(parents=True, exist_ok=True)
    target = data_dir / "hansard_sample.parquet"
    sample.to_parquet(target, index=False)

    print(f"wrote {len(sample)} speeches to {target}")
    print(f"topic distribution: {sample['topic'].value_counts().to_dict()}")
    print(f"frame distribution: {sample['frame'].value_counts().to_dict()}")
    print(f"party distribution: {sample['party'].value_counts().to_dict()}")
    print(f"year range: {sample['year'].min()}–{sample['year'].max()}")
218
+
219
+
220
+ if __name__ == "__main__":
221
+ main()