pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
pycorpdiff/corpus.py
ADDED
|
@@ -0,0 +1,411 @@
|
|
|
1
|
+
"""Core ``Corpus`` and ``CorpusSlice`` data structures.
|
|
2
|
+
|
|
3
|
+
A :class:`Corpus` wraps a :class:`pandas.DataFrame` of documents plus
|
|
4
|
+
metadata. Slicing returns a :class:`CorpusSlice` that shares the
|
|
5
|
+
parent's configuration (text column, tokenizer) but presents a
|
|
6
|
+
boolean-masked view. Both objects are immutable frozen dataclasses;
|
|
7
|
+
mutations produce new objects.
|
|
8
|
+
|
|
9
|
+
**Polars interop.** A Corpus stores its documents internally as a
|
|
10
|
+
pandas DataFrame because that's what the analytical layer is built on,
|
|
11
|
+
but the constructors and round-trip helpers accept and produce polars
|
|
12
|
+
DataFrames so they slot into polars-native pipelines:
|
|
13
|
+
|
|
14
|
+
>>> import polars as pl
|
|
15
|
+
>>> df = pl.DataFrame({"text": ["the cat sat"], "outlet": ["A"]})
|
|
16
|
+
>>> corpus = pcd.from_dataframe(df, text_col="text") # polars → pandas internally
|
|
17
|
+
>>> corpus.to_polars().shape # round-trip back
|
|
18
|
+
(1, 2)
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections import Counter
|
|
24
|
+
from dataclasses import dataclass, field, replace
|
|
25
|
+
from typing import TYPE_CHECKING, Any
|
|
26
|
+
|
|
27
|
+
import numpy as np
|
|
28
|
+
import pandas as pd
|
|
29
|
+
|
|
30
|
+
from .tokenize import RegexTokenizer, Tokenizer
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
import polars as pl
|
|
34
|
+
|
|
35
|
+
from .temporal.slicing import TemporalCorpus
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _doc_term_counts(
|
|
39
|
+
docs: pd.DataFrame,
|
|
40
|
+
text_col: str,
|
|
41
|
+
tokenizer: Tokenizer,
|
|
42
|
+
min_count: int = 1,
|
|
43
|
+
) -> pd.DataFrame:
|
|
44
|
+
"""Build a docs × term integer count matrix.
|
|
45
|
+
|
|
46
|
+
The result is dense (``int64``) and indexed by the parent frame's index.
|
|
47
|
+
For corpora large enough that a dense ``n_docs × |vocab|`` matrix is
|
|
48
|
+
infeasible (~10⁵ docs × ~10⁵ vocab → ~80 GB int64), use
|
|
49
|
+
:meth:`Corpus.doc_term_counts_sparse` instead.
|
|
50
|
+
"""
|
|
51
|
+
counts_per_doc: list[Counter[str]] = [Counter(tokenizer(t)) for t in docs[text_col]]
|
|
52
|
+
all_terms: list[str] = sorted({term for c in counts_per_doc for term in c})
|
|
53
|
+
term_to_idx: dict[str, int] = {t: i for i, t in enumerate(all_terms)}
|
|
54
|
+
|
|
55
|
+
data = np.zeros((len(counts_per_doc), len(all_terms)), dtype=np.int64)
|
|
56
|
+
for i, doc_counts in enumerate(counts_per_doc):
|
|
57
|
+
for term, n in doc_counts.items():
|
|
58
|
+
data[i, term_to_idx[term]] = n
|
|
59
|
+
|
|
60
|
+
df = pd.DataFrame(data, columns=all_terms, index=docs.index)
|
|
61
|
+
if min_count > 1:
|
|
62
|
+
df = df.loc[:, df.sum(axis=0) >= min_count]
|
|
63
|
+
return df
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def _doc_term_counts_sparse(
|
|
67
|
+
docs: pd.DataFrame,
|
|
68
|
+
text_col: str,
|
|
69
|
+
tokenizer: Tokenizer,
|
|
70
|
+
min_count: int = 1,
|
|
71
|
+
) -> tuple[Any, list[str]]:
|
|
72
|
+
"""Build the same docs × term counts as :func:`_doc_term_counts` but sparse.
|
|
73
|
+
|
|
74
|
+
Returns ``(csr_matrix, vocab)`` — the canonical scikit-learn shape that
|
|
75
|
+
sklearn's :class:`CountVectorizer` and gensim's matrix utilities both
|
|
76
|
+
expose. Memory scales with nnz (non-zero cells), not ``n_docs × |vocab|``.
|
|
77
|
+
|
|
78
|
+
Computed by accumulating ``(row, col, count)`` triples in ``coo`` form
|
|
79
|
+
then converting to ``csr``; vocabulary is sorted lexicographically to
|
|
80
|
+
match :func:`_doc_term_counts` so the two views agree column-for-column.
|
|
81
|
+
"""
|
|
82
|
+
from scipy.sparse import coo_matrix
|
|
83
|
+
|
|
84
|
+
counts_per_doc: list[Counter[str]] = [Counter(tokenizer(t)) for t in docs[text_col]]
|
|
85
|
+
all_terms: list[str] = sorted({term for c in counts_per_doc for term in c})
|
|
86
|
+
term_to_idx: dict[str, int] = {t: i for i, t in enumerate(all_terms)}
|
|
87
|
+
|
|
88
|
+
rows: list[int] = []
|
|
89
|
+
cols: list[int] = []
|
|
90
|
+
vals: list[int] = []
|
|
91
|
+
for i, doc_counts in enumerate(counts_per_doc):
|
|
92
|
+
for term, n in doc_counts.items():
|
|
93
|
+
rows.append(i)
|
|
94
|
+
cols.append(term_to_idx[term])
|
|
95
|
+
vals.append(n)
|
|
96
|
+
|
|
97
|
+
n_docs = len(counts_per_doc)
|
|
98
|
+
n_terms = len(all_terms)
|
|
99
|
+
matrix = coo_matrix(
|
|
100
|
+
(np.asarray(vals, dtype=np.int64), (rows, cols)),
|
|
101
|
+
shape=(n_docs, n_terms),
|
|
102
|
+
dtype=np.int64,
|
|
103
|
+
).tocsr()
|
|
104
|
+
|
|
105
|
+
if min_count > 1:
|
|
106
|
+
col_totals = np.asarray(matrix.sum(axis=0)).ravel()
|
|
107
|
+
keep_mask = col_totals >= min_count
|
|
108
|
+
matrix = matrix[:, keep_mask]
|
|
109
|
+
all_terms = [t for t, k in zip(all_terms, keep_mask, strict=True) if k]
|
|
110
|
+
|
|
111
|
+
return matrix, all_terms
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def _coerce_to_pandas(docs: Any) -> pd.DataFrame:
|
|
115
|
+
"""Accept a pandas or polars DataFrame; return a pandas one.
|
|
116
|
+
|
|
117
|
+
The analytical layer is pandas-based; this function is the single
|
|
118
|
+
boundary where polars input gets converted. ``polars.DataFrame``
|
|
119
|
+
has ``.to_pandas()`` so the conversion is one method call.
|
|
120
|
+
"""
|
|
121
|
+
if isinstance(docs, pd.DataFrame):
|
|
122
|
+
return docs
|
|
123
|
+
# Defer the polars import — it's an optional dep.
|
|
124
|
+
try:
|
|
125
|
+
import polars as pl
|
|
126
|
+
except ImportError: # pragma: no cover
|
|
127
|
+
pl = None # type: ignore[assignment]
|
|
128
|
+
if pl is not None and isinstance(docs, pl.DataFrame):
|
|
129
|
+
return docs.to_pandas()
|
|
130
|
+
raise TypeError(
|
|
131
|
+
f"docs must be a pandas or polars DataFrame; got {type(docs).__name__}"
|
|
132
|
+
)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
@dataclass(frozen=True, eq=False)
class Corpus:
    """A corpus of documents with optional metadata columns.

    Parameters
    ----------
    docs
        A DataFrame whose rows are documents. Must contain at least the
        text column named by ``text_col``. Accepts either
        :class:`pandas.DataFrame` or :class:`polars.DataFrame`; polars
        input is converted to pandas internally.
    text_col
        Name of the column containing document text.
    id_col
        Name of an optional unique-document-id column.
    meta_cols
        Tuple of column names treated as metadata available for slicing.
        If empty (the default), every non-text column is considered
        metadata.
    tokenizer
        A callable conforming to :class:`pycorpdiff.tokenize.Tokenizer`.
        Defaults to the package's :class:`RegexTokenizer`.

    Equality and hashing
    --------------------

    Two corpora are equal when they have the same configuration
    (``text_col``, ``id_col``, ``meta_cols``, tokenizer ``repr``) and their
    document tables have the same column labels, dtypes and values in the
    same row order. The pandas row index is ignored.

    The hash is derived from the same content (via
    :func:`pandas.util.hash_pandas_object`), so equal corpora hash alike and
    a Corpus can be used as a dict key when memoising analyses. The hash is
    coarser than equality — it sums per-row hashes, so it does not
    distinguish row orderings — which is why ``__eq__`` compares the actual
    content rather than the hashes.
    """

    docs: pd.DataFrame
    text_col: str = "text"
    id_col: str | None = None
    meta_cols: tuple[str, ...] = ()
    tokenizer: Tokenizer = field(default_factory=RegexTokenizer)

    def __post_init__(self) -> None:
        # Coerce polars to pandas if needed; otherwise leave alone.
        # object.__setattr__ is required because the dataclass is frozen.
        if not isinstance(self.docs, pd.DataFrame):
            object.__setattr__(self, "docs", _coerce_to_pandas(self.docs))
        if self.text_col not in self.docs.columns:
            raise ValueError(
                f"text_col={self.text_col!r} not found in DataFrame columns "
                f"{list(self.docs.columns)!r}"
            )
        if self.id_col is not None and self.id_col not in self.docs.columns:
            raise ValueError(
                f"id_col={self.id_col!r} not found in DataFrame columns "
                f"{list(self.docs.columns)!r}"
            )

    def __len__(self) -> int:
        """Number of documents (rows) in the corpus."""
        return len(self.docs)

    def __hash__(self) -> int:
        """Content-derived hash for cache keys / reproducibility checks.

        Combines a vectorised hash of every document row with the column
        labels and the corpus configuration (text/id/meta columns +
        tokenizer repr). The row index is excluded.
        """
        row_hash = (
            int(pd.util.hash_pandas_object(self.docs, index=False).sum())
            & 0xFFFFFFFFFFFFFFFF
        )
        return hash(
            (
                row_hash,
                # Column labels are folded in explicitly so that renaming a
                # column changes the hash even when the cell values do not.
                tuple(self.docs.columns),
                self.text_col,
                self.id_col,
                # tuple() keeps hashing usable when a caller passed a list.
                tuple(self.meta_cols),
                repr(self.tokenizer),
            )
        )

    def __eq__(self, other: object) -> bool:
        """Compare configuration and document content (row index ignored).

        Hashes are deliberately not used as the verdict: distinct corpora
        can share a hash, and equality must not report them as the same.
        """
        if not isinstance(other, Corpus):
            return NotImplemented
        if self is other:
            return True
        own_config = (
            self.text_col,
            self.id_col,
            tuple(self.meta_cols),
            repr(self.tokenizer),
        )
        other_config = (
            other.text_col,
            other.id_col,
            tuple(other.meta_cols),
            repr(other.tokenizer),
        )
        if own_config != other_config:
            return False
        # Drop the row index on both sides to mirror ``index=False`` in
        # __hash__, keeping "equal implies equal hash" true.
        own_docs = self.docs.reset_index(drop=True)
        other_docs = other.docs.reset_index(drop=True)
        return bool(own_docs.equals(other_docs))

    @property
    def metadata_columns(self) -> tuple[str, ...]:
        """Effective metadata columns — explicit if given, else inferred."""
        if self.meta_cols:
            return self.meta_cols
        return tuple(c for c in self.docs.columns if c != self.text_col)

    @staticmethod
    def _is_membership_filter(value: Any) -> bool:
        """Return True when a ``slice()`` value means "any of these"."""
        if isinstance(value, np.ndarray):
            # A 0-d array is a scalar in disguise; compare it with ``==``.
            return value.ndim > 0
        return isinstance(
            value, (list, tuple, set, frozenset, range, pd.Index, pd.Series)
        )

    def slice(self, **filters: Any) -> CorpusSlice:
        """Return a :class:`CorpusSlice` filtered on document columns.

        Each keyword argument is a column name; the value may be a scalar
        (exact match) or a collection — list, tuple, set, frozenset, range,
        :class:`pandas.Index`, :class:`pandas.Series` or NumPy array —
        (membership). Strings are always treated as scalars. All conditions
        are combined with logical AND.

        Raises
        ------
        KeyError
            If a keyword does not name a column of ``docs``.
        """
        mask = pd.Series(True, index=self.docs.index)
        for col, value in filters.items():
            if col not in self.docs.columns:
                raise KeyError(f"slice() got unknown column {col!r}")
            if self._is_membership_filter(value):
                mask &= self.docs[col].isin(list(value))
            else:
                mask &= self.docs[col] == value
        return CorpusSlice(parent=self, mask=mask, filters=dict(filters))

    def by_time(self, col: str, freq: str = "Y") -> TemporalCorpus:
        """Return a :class:`TemporalCorpus` indexed by time-period.

        ``col`` must be parseable as datetime; ``freq`` is any pandas
        offset alias (``"Y"``, ``"Q"``, ``"M"``, ``"W"``, ``"D"``).
        """
        from .temporal.slicing import TemporalCorpus  # local import to break cycle

        return TemporalCorpus(parent=self, time_col=col, freq=freq)

    def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
        """Return a copy of the corpus with a different tokenizer."""
        return replace(self, tokenizer=tokenizer)

    def tokens(self) -> list[list[str]]:
        """Tokenize every document; return one list of tokens per doc."""
        return [self.tokenizer(t) for t in self.docs[self.text_col]]

    def doc_term_counts(self, min_count: int = 1) -> pd.DataFrame:
        """Return a docs × term integer count DataFrame."""
        return _doc_term_counts(self.docs, self.text_col, self.tokenizer, min_count)

    def doc_term_counts_sparse(self, min_count: int = 1) -> tuple[Any, list[str]]:
        """Return the docs × term matrix in :class:`scipy.sparse` form.

        Returns ``(matrix, vocab)`` where ``matrix`` is a
        :class:`scipy.sparse.csr_matrix` of shape ``(n_docs, |vocab|)``
        and ``vocab`` is the lexicographically-sorted list of column
        terms. This is the canonical scikit-learn shape, and lets you
        plug a pycorpdiff corpus directly into anything that expects
        :class:`~sklearn.feature_extraction.text.CountVectorizer` output.

        For typical analytical work the dense
        :meth:`doc_term_counts` is fine — but on a corpus with ~100K
        docs and ~50K vocab the dense int64 matrix is ~40 GB while the
        sparse CSR is megabytes. Use this when the dense matrix would
        blow your RAM budget.
        """
        return _doc_term_counts_sparse(
            self.docs, self.text_col, self.tokenizer, min_count
        )

    def vocab(self, min_count: int = 1) -> pd.Series:
        """Return a term → total-count Series sorted descending."""
        counts = self.doc_term_counts(min_count=min_count).sum(axis=0)
        return counts.rename("count").sort_values(ascending=False)

    def total_tokens(self) -> int:
        """Total tokens across all documents (before any min_count filter).

        Counted straight from the tokenizer output, so no docs × vocab
        matrix is materialised just to be summed.
        """
        return sum(
            1 for text in self.docs[self.text_col] for _ in self.tokenizer(text)
        )

    def to_polars(self) -> pl.DataFrame:
        """Return the corpus's documents as a polars DataFrame.

        Requires the ``polars`` extra. The original pandas index is
        dropped (polars has no concept of a row index); the document
        ordering is preserved.

        Raises
        ------
        ImportError
            If polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "to_polars() requires polars. Install with: pip install 'pycorpdiff[polars]'"
            ) from exc
        return pl.from_pandas(self.docs.reset_index(drop=True))
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
@dataclass(frozen=True, eq=False)
class CorpusSlice:
    """A boolean-masked view of a :class:`Corpus`.

    Slices behave like corpora for downstream analytical purposes —
    they expose the same ``docs``, ``text_col``, ``tokenizer`` surface —
    but also remember the ``filters`` that produced them, which the
    :class:`pycorpdiff.compare.Comparison` machinery uses to label
    plots and result tables.

    Equality and hashing are by object identity (``eq=False``). A
    field-wise comparison cannot work for this class: comparing two
    ``mask`` Series yields a Series rather than a bool, and neither a
    Series nor the ``filters`` dict is hashable.
    """

    # The corpus this slice is a view of.
    parent: Corpus
    # Boolean Series selecting the rows of ``parent.docs`` in this slice.
    mask: pd.Series
    # The column -> value conditions that produced ``mask`` (used for labels).
    filters: dict[str, Any]

    @property
    def docs(self) -> pd.DataFrame:
        """Rows of the parent's document table selected by ``mask``."""
        return self.parent.docs.loc[self.mask]

    @property
    def text_col(self) -> str:
        """Name of the text column, taken from the parent corpus."""
        return self.parent.text_col

    @property
    def id_col(self) -> str | None:
        """Name of the id column (or ``None``), taken from the parent corpus."""
        return self.parent.id_col

    @property
    def tokenizer(self) -> Tokenizer:
        """Tokenizer of the parent corpus."""
        return self.parent.tokenizer

    def __len__(self) -> int:
        """Number of documents selected by the mask."""
        return int(self.mask.sum())

    @property
    def label(self) -> str:
        """A short human-readable label derived from the slice's filters."""
        if not self.filters:
            return "slice"
        return ", ".join(f"{k}={v!r}" for k, v in self.filters.items())

    def tokens(self) -> list[list[str]]:
        """Tokenize every document in the slice."""
        return [self.tokenizer(t) for t in self.docs[self.text_col]]

    def doc_term_counts(self, min_count: int = 1) -> pd.DataFrame:
        """Return a docs × term integer count DataFrame for the slice."""
        return _doc_term_counts(self.docs, self.text_col, self.tokenizer, min_count)

    def doc_term_counts_sparse(self, min_count: int = 1) -> tuple[Any, list[str]]:
        """Sparse counterpart to :meth:`doc_term_counts`. See
        :meth:`Corpus.doc_term_counts_sparse` for semantics.
        """
        return _doc_term_counts_sparse(
            self.docs, self.text_col, self.tokenizer, min_count
        )

    def vocab(self, min_count: int = 1) -> pd.Series:
        """Return a term → total-count Series sorted descending."""
        counts = self.doc_term_counts(min_count=min_count).sum(axis=0)
        return counts.rename("count").sort_values(ascending=False)

    def total_tokens(self) -> int:
        """Total tokens across the slice's documents.

        Counted straight from the tokenizer output, so no docs × vocab
        matrix is materialised just to be summed.
        """
        return sum(
            1 for text in self.docs[self.text_col] for _ in self.tokenizer(text)
        )

    def slice(self, **filters: Any) -> CorpusSlice:
        """Further filter this slice — produces a new CorpusSlice on the
        same parent with the masks AND-ed and the filters merged.

        Lets you chain: ``corpus.slice(topic="x").slice(party="y")``.

        The new conditions are evaluated by the parent's own ``slice`` so
        both entry points apply identical matching rules (and raise the
        same ``KeyError`` for an unknown column). When a column is filtered
        again, the recorded filter value is the most recent one.
        """
        narrowed = self.parent.slice(**filters)
        return CorpusSlice(
            parent=self.parent,
            mask=self.mask & narrowed.mask,
            filters={**self.filters, **filters},
        )

    def by_time(self, col: str, freq: str = "Y") -> TemporalCorpus:
        """Return a TemporalCorpus over the slice's documents only.

        Materialises the slice's masked rows into a fresh Corpus, then
        builds the TemporalCorpus on it. Lets you chain
        ``corpus.slice(topic="x").by_time("date")``.
        """
        from .temporal.slicing import TemporalCorpus  # local import to break cycle

        fresh = Corpus(
            docs=self.docs.reset_index(drop=True),
            text_col=self.text_col,
            id_col=self.id_col,
            meta_cols=self.parent.meta_cols,
            tokenizer=self.tokenizer,
        )
        return TemporalCorpus(parent=fresh, time_col=col, freq=freq)

    def to_polars(self) -> pl.DataFrame:
        """Return the slice's documents as a polars DataFrame.

        Raises
        ------
        ImportError
            If polars is not installed.
        """
        try:
            import polars as pl
        except ImportError as exc:  # pragma: no cover
            raise ImportError(
                "to_polars() requires polars. Install with: pip install 'pycorpdiff[polars]'"
            ) from exc
        return pl.from_pandas(self.docs.reset_index(drop=True))
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Bundled corpora for demonstrations, tutorials, and reproducible tests.
|
|
2
|
+
|
|
3
|
+
What ships with the package
|
|
4
|
+
---------------------------
|
|
5
|
+
|
|
6
|
+
- :func:`load_hansard_sample` — a 200-speech synthetic corpus designed
|
|
7
|
+
to mimic UK Hansard's structure across two decades, four topics, and
|
|
8
|
+
four parties, with topical language shifts around real-world events
|
|
9
|
+
(Brexit referendum, COVID-19, the climate-emergency declarations).
|
|
10
|
+
|
|
11
|
+
The sample is **synthetic** but its structure is realistic enough to
|
|
12
|
+
demo every analytical surface in pycorpdiff. For an actual research
|
|
13
|
+
project users will want the real Hansard archive — see the docstring on
|
|
14
|
+
:func:`load_hansard_sample` for the canonical download paths.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from .hansard import fetch_hansard, load_hansard_sample
|
|
20
|
+
from .histwords import fetch_histwords_decade, histwords_cosine_shift
|
|
21
|
+
|
|
22
|
+
__all__ = [
|
|
23
|
+
"fetch_hansard",
|
|
24
|
+
"fetch_histwords_decade",
|
|
25
|
+
"histwords_cosine_shift",
|
|
26
|
+
"load_hansard_sample",
|
|
27
|
+
]
|
|
Binary file
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Deterministically generate the synthetic Hansard-sample parquet.
|
|
2
|
+
|
|
3
|
+
This script is the source-of-truth for ``hansard_sample.parquet``. It is
|
|
4
|
+
*not* run at import time — the parquet is committed and shipped with the
|
|
5
|
+
package, and :func:`load_hansard_sample` just reads it. The script is
|
|
6
|
+
here so reviewers can verify the sample is reproducible and so we can
|
|
7
|
+
regenerate it when the templates change.
|
|
8
|
+
|
|
9
|
+
Run with::
|
|
10
|
+
|
|
11
|
+
python -m pycorpdiff.datasets._generate_hansard
|
|
12
|
+
|
|
13
|
+
The output is written to ``src/pycorpdiff/datasets/_data/hansard_sample.parquet``.
|
|
14
|
+
The generator is seeded so output is byte-identical across machines and
|
|
15
|
+
Python versions.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
|
|
22
|
+
import numpy as np
|
|
23
|
+
import pandas as pd
|
|
24
|
+
|
|
25
|
+
OPENINGS = [
|
|
26
|
+
"I rise to address the House on",
|
|
27
|
+
"I beg leave to bring to the attention of this House the question of",
|
|
28
|
+
"Mr Speaker, I wish to make a statement concerning",
|
|
29
|
+
"I am grateful for the opportunity to debate",
|
|
30
|
+
"The Honourable Members of this House should consider",
|
|
31
|
+
"Madam Deputy Speaker, I wish to address",
|
|
32
|
+
"I am pleased to speak on the matter of",
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
CLOSINGS = [
|
|
36
|
+
"I commend this motion to the House.",
|
|
37
|
+
"I urge my Honourable colleagues to support this.",
|
|
38
|
+
"The Government must take immediate action.",
|
|
39
|
+
"We owe this much to our constituents.",
|
|
40
|
+
"This matter brooks no further delay.",
|
|
41
|
+
"The time for inaction has long passed.",
|
|
42
|
+
"I beg to move.",
|
|
43
|
+
]
|
|
44
|
+
|
|
45
|
+
# (topic, period_label) -> list of body templates.
|
|
46
|
+
# The period_label structure encodes the temporal-frame shifts:
|
|
47
|
+
#   - immigration shifts from humanising (before 2016) to criminalising (2016 onward)
|
|
48
|
+
# - brexit goes through emerging → peak → aftermath
|
|
49
|
+
#   - nhs has a steady frame + crisis frames in 2010–2014 (austerity) and 2020–2022 (covid)
|
|
50
|
+
#   - climate sharpens scientific (before 2011) → policy (2011–2018) → crisis (2019 onward)
|
|
51
|
+
TOPIC_BODIES: dict[tuple[str, str], list[str]] = {
|
|
52
|
+
("immigration", "humanising"): [
|
|
53
|
+
"the immigrant worker arrived with hope and the immigrant family settled with dignity",
|
|
54
|
+
"the immigrant community contributes to our shared prosperity and shared future",
|
|
55
|
+
"the immigrant family deserves protection refuge and a clear path to citizenship",
|
|
56
|
+
"the immigrant worker rights advance through union solidarity and our labour movement",
|
|
57
|
+
"the immigrant community organised with strength and the immigrant worker spoke with pride",
|
|
58
|
+
"the immigrant family brings cultural richness and economic vitality to our towns",
|
|
59
|
+
"the immigrant worker contributes to public services education and our national life",
|
|
60
|
+
"the immigrant community has thrived and the immigrant family has flourished here",
|
|
61
|
+
],
|
|
62
|
+
("immigration", "criminalising"): [
|
|
63
|
+
"the immigrant criminal threat grows and the immigrant invasion of gangs spreads",
|
|
64
|
+
"the immigrant criminal element alarms residents and our border control has failed",
|
|
65
|
+
"the immigrant invasion narrative dominates news and immigrant criminal gangs persist",
|
|
66
|
+
"the immigrant threat has increased and the immigrant crime narrative grew with concern",
|
|
67
|
+
"the immigrant gangs threaten the border and the immigrant criminal risk grows daily",
|
|
68
|
+
"the immigrant criminal threat must be confronted and the immigrant invasion halted",
|
|
69
|
+
"the immigrant criminal gangs operate freely and immigrant invasion routes remain open",
|
|
70
|
+
],
|
|
71
|
+
("brexit", "emerging"): [
|
|
72
|
+
"the european question must be addressed and the referendum we promised must be delivered",
|
|
73
|
+
"our relationship with europe requires renegotiation and reform from this government",
|
|
74
|
+
"the european union framework no longer serves british interests and british sovereignty",
|
|
75
|
+
"the european treaties demand renegotiation before the public can give their consent",
|
|
76
|
+
"the european question divides this house but the people must have their say",
|
|
77
|
+
],
|
|
78
|
+
("brexit", "peak"): [
|
|
79
|
+
"the brexit referendum result must be respected and delivered without further delay",
|
|
80
|
+
"the brexit deal must respect the democratic will of seventeen million leave voters",
|
|
81
|
+
"the brexit transition must protect british businesses and british workers from harm",
|
|
82
|
+
"the brexit negotiations require firm leadership and a clear vision for our nation",
|
|
83
|
+
"the brexit outcome will define this generation and the brexit deal must be honoured",
|
|
84
|
+
"the brexit settlement requires patience but the brexit mandate is unambiguous",
|
|
85
|
+
],
|
|
86
|
+
("brexit", "aftermath"): [
|
|
87
|
+
"the brexit deal has delivered for british sovereignty and british democratic accountability",
|
|
88
|
+
"the brexit aftermath reveals supply chain disruption and significant economic adjustment",
|
|
89
|
+
"the brexit transition continues with new opportunities for global trade and partnership",
|
|
90
|
+
"the brexit dividend has yet to materialise for working families across our nation",
|
|
91
|
+
"the brexit settlement requires further work on northern ireland and our customs arrangements",
|
|
92
|
+
],
|
|
93
|
+
("nhs", "normal"): [
|
|
94
|
+
"the national health service requires sustained investment for our nurses and doctors",
|
|
95
|
+
"the nhs workforce must be supported with proper funding and training programmes",
|
|
96
|
+
"the patient care standards must be maintained across all hospitals and trusts",
|
|
97
|
+
"the nhs provides universal care and the nhs principles remain our foundation",
|
|
98
|
+
"the national health service belongs to all of us and to our future generations",
|
|
99
|
+
],
|
|
100
|
+
("nhs", "austerity"): [
|
|
101
|
+
"the nhs austerity cuts threaten patient care and waiting times grow alarming",
|
|
102
|
+
"the nhs underfunding creates crises in accident and emergency departments nationwide",
|
|
103
|
+
"the nhs austerity decisions cost lives and the nhs underfunding harms our communities",
|
|
104
|
+
"the nhs cuts must be reversed and the nhs funding settlement must be honoured",
|
|
105
|
+
],
|
|
106
|
+
("nhs", "covid"): [
|
|
107
|
+
"the nhs response to the pandemic deserves our gratitude and continued support",
|
|
108
|
+
"the nhs covid crisis demands emergency funding and ventilator capacity now",
|
|
109
|
+
"the nhs frontline workers face unprecedented pressure and the nhs covid response saves lives",
|
|
110
|
+
"the nhs covid surge requires us to clap and to legislate for fair pay",
|
|
111
|
+
],
|
|
112
|
+
("climate", "scientific"): [
|
|
113
|
+
"the scientific consensus on climate change requires policy response from this government",
|
|
114
|
+
"the climate models indicate warming trends that demand emissions reduction targets",
|
|
115
|
+
"the climate science is settled and the climate research is unambiguous in its conclusions",
|
|
116
|
+
"the climate evidence accumulates and the climate scientists call for action",
|
|
117
|
+
],
|
|
118
|
+
("climate", "policy"): [
|
|
119
|
+
"the climate policy framework must align with our paris agreement obligations",
|
|
120
|
+
"the climate change committee recommends carbon budget reductions for the coming decade",
|
|
121
|
+
"the climate policy must include just transition for fossil fuel workers and communities",
|
|
122
|
+
"the climate framework needs revision and the climate policy targets need strengthening",
|
|
123
|
+
],
|
|
124
|
+
("climate", "crisis"): [
|
|
125
|
+
"the climate crisis is here now and the climate emergency demands urgent action",
|
|
126
|
+
"the climate breakdown threatens our coastlines and the climate emergency cannot wait",
|
|
127
|
+
"the climate crisis requires immediate emissions cuts and the climate emergency is upon us",
|
|
128
|
+
"the climate disaster unfolds and the climate emergency response must accelerate",
|
|
129
|
+
"the climate emergency declaration must be backed by the climate action this house owes",
|
|
130
|
+
],
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
# Period predicates for each topic. Each yields a (year) -> period_label
|
|
134
|
+
# mapping that decides which template bucket a speech in that year uses.
|
|
135
|
+
def _immigration_period(year: int) -> str:
|
|
136
|
+
return "humanising" if year < 2016 else "criminalising"
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _brexit_period(year: int) -> str:
|
|
140
|
+
if year < 2016:
|
|
141
|
+
return "emerging"
|
|
142
|
+
if year < 2020:
|
|
143
|
+
return "peak"
|
|
144
|
+
return "aftermath"
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
def _nhs_period(year: int) -> str:
|
|
148
|
+
if 2010 <= year <= 2014:
|
|
149
|
+
return "austerity"
|
|
150
|
+
if 2020 <= year <= 2022:
|
|
151
|
+
return "covid"
|
|
152
|
+
return "normal"
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _climate_period(year: int) -> str:
|
|
156
|
+
if year < 2011:
|
|
157
|
+
return "scientific"
|
|
158
|
+
if year < 2019:
|
|
159
|
+
return "policy"
|
|
160
|
+
return "crisis"
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
PERIOD_FOR = {
|
|
164
|
+
"immigration": _immigration_period,
|
|
165
|
+
"brexit": _brexit_period,
|
|
166
|
+
"nhs": _nhs_period,
|
|
167
|
+
"climate": _climate_period,
|
|
168
|
+
}
|
|
169
|
+
|
|
170
|
+
PARTIES = ["Labour", "Conservative", "Liberal Democrat", "SNP"]
|
|
171
|
+
TOPICS = ["immigration", "brexit", "nhs", "climate"]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def generate(seed: int = 20260522) -> pd.DataFrame:
    """Return the deterministic synthetic Hansard sample (about 200 speeches).

    One row is produced per speech for every year from 2005 to 2023: ten
    speeches per year, plus one extra in 2016, 2017 and 2019 — 193 rows in
    total. Each row has the columns ``speech_id``, ``text``, ``topic``,
    ``frame``, ``party``, ``date`` and ``year``.

    All randomness comes from a single ``numpy`` generator seeded with
    ``seed``. The order of the draws below is part of the output contract:
    reordering them changes the generated sample.
    """
    rng = np.random.default_rng(seed)

    def pick(pool):
        # One uniform draw over the pool's indices.
        return pool[int(rng.integers(0, len(pool)))]

    busier_years = {2016, 2017, 2019}
    records: list[dict[str, object]] = []
    for year in range(2005, 2024):
        # Years around the referendum get one additional speech; its topic
        # is drawn at random like any other.
        n_speeches = 11 if year in busier_years else 10
        for _ in range(n_speeches):
            topic = pick(TOPICS)
            frame = PERIOD_FOR[topic](year)
            body = pick(TOPIC_BODIES[(topic, frame)])
            opening = pick(OPENINGS)
            closing = pick(CLOSINGS)
            party = pick(PARTIES)
            month = int(rng.integers(1, 13))
            # Upper bound is exclusive, so days fall in 1–27 and are valid
            # in every month.
            day = int(rng.integers(1, 28))
            records.append(
                {
                    # Running 0-based id: the number of rows emitted so far.
                    "speech_id": len(records),
                    "text": f"{opening} {topic}. {body}. {closing}",
                    "topic": topic,
                    "frame": frame,
                    "party": party,
                    "date": f"{year}-{month:02d}-{day:02d}",
                    "year": year,
                }
            )
    return pd.DataFrame(records)
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def main() -> None:
    """Regenerate the bundled parquet and print a short summary.

    Writes the output of :func:`generate` to
    ``_data/hansard_sample.parquet`` next to this module (creating the
    directory if needed), then prints the row count, the topic / frame /
    party distributions and the year range.
    """
    sample = generate()
    target = Path(__file__).parent / "_data" / "hansard_sample.parquet"
    target.parent.mkdir(parents=True, exist_ok=True)
    sample.to_parquet(target, index=False)

    print(f"wrote {len(sample)} speeches to {target}")
    for column in ("topic", "frame", "party"):
        print(f"{column} distribution: {sample[column].value_counts().to_dict()}")
    print(f"year range: {sample['year'].min()}–{sample['year'].max()}")
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
if __name__ == "__main__":
|
|
221
|
+
main()
|