pycorpdiff 0.1.0a0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pycorpdiff/__init__.py +126 -0
- pycorpdiff/_backends/__init__.py +3 -0
- pycorpdiff/_backends/pandas.py +3 -0
- pycorpdiff/_backends/polars.py +3 -0
- pycorpdiff/collocation/__init__.py +19 -0
- pycorpdiff/collocation/cooccurrence.py +65 -0
- pycorpdiff/collocation/measures.py +102 -0
- pycorpdiff/collocation/network.py +233 -0
- pycorpdiff/collocation/shift.py +146 -0
- pycorpdiff/compare.py +345 -0
- pycorpdiff/corpus.py +411 -0
- pycorpdiff/datasets/__init__.py +27 -0
- pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- pycorpdiff/datasets/_generate_hansard.py +221 -0
- pycorpdiff/datasets/hansard.py +235 -0
- pycorpdiff/datasets/histwords.py +221 -0
- pycorpdiff/explain.py +177 -0
- pycorpdiff/io/__init__.py +16 -0
- pycorpdiff/io/duckdb.py +92 -0
- pycorpdiff/io/huggingface.py +142 -0
- pycorpdiff/io/readers.py +138 -0
- pycorpdiff/keyness/__init__.py +26 -0
- pycorpdiff/keyness/bayes.py +50 -0
- pycorpdiff/keyness/chi_squared.py +94 -0
- pycorpdiff/keyness/correction.py +34 -0
- pycorpdiff/keyness/dispersion.py +89 -0
- pycorpdiff/keyness/effect_sizes.py +65 -0
- pycorpdiff/keyness/loglikelihood.py +92 -0
- pycorpdiff/keyness/multicorpus.py +143 -0
- pycorpdiff/keyness/permutation.py +154 -0
- pycorpdiff/py.typed +0 -0
- pycorpdiff/results.py +635 -0
- pycorpdiff/semantic/__init__.py +18 -0
- pycorpdiff/semantic/alignment.py +53 -0
- pycorpdiff/semantic/embed.py +84 -0
- pycorpdiff/semantic/shift.py +224 -0
- pycorpdiff/semantic/trajectory.py +166 -0
- pycorpdiff/stats.py +69 -0
- pycorpdiff/temporal/__init__.py +15 -0
- pycorpdiff/temporal/bocpd.py +233 -0
- pycorpdiff/temporal/causal_impact.py +293 -0
- pycorpdiff/temporal/changepoint.py +92 -0
- pycorpdiff/temporal/forecast.py +405 -0
- pycorpdiff/temporal/its.py +123 -0
- pycorpdiff/temporal/slicing.py +174 -0
- pycorpdiff/tokenize.py +110 -0
- pycorpdiff/viz/__init__.py +37 -0
- pycorpdiff/viz/bocpd.py +173 -0
- pycorpdiff/viz/causal_impact.py +142 -0
- pycorpdiff/viz/collocation.py +48 -0
- pycorpdiff/viz/dispersion.py +117 -0
- pycorpdiff/viz/forecast.py +129 -0
- pycorpdiff/viz/keyness.py +96 -0
- pycorpdiff/viz/network.py +186 -0
- pycorpdiff/viz/scattertext.py +160 -0
- pycorpdiff/viz/semantic_forecast.py +114 -0
- pycorpdiff/viz/trajectory.py +48 -0
- pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
- pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
- pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
- pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
"""UK Hansard loader: bundled synthetic sample + live fetcher.
|
|
2
|
+
|
|
3
|
+
Two functions live here:
|
|
4
|
+
|
|
5
|
+
- :func:`load_hansard_sample` — return the bundled 193-speech synthetic
|
|
6
|
+
sample. Deterministic; ships with the package; no network. Use this
|
|
7
|
+
for tutorials, tests, and offline demos.
|
|
8
|
+
- :func:`fetch_hansard` — query the live UK Parliament Hansard search
|
|
9
|
+
API, optionally caching to a local parquet, and return the matched
|
|
10
|
+
speeches as a :class:`Corpus`. Use this for actual research.
|
|
11
|
+
|
|
12
|
+
The live API
|
|
13
|
+
------------
|
|
14
|
+
|
|
15
|
+
``fetch_hansard`` hits the public Hansard search endpoint at
|
|
16
|
+
``https://hansard-api.parliament.uk/``. The endpoint requires no auth
|
|
17
|
+
and serves UK parliamentary speeches under the Open Government
|
|
18
|
+
Licence (essentially public domain, attribution requested).
|
|
19
|
+
|
|
20
|
+
The API surface changes occasionally; if a field name changes upstream
|
|
21
|
+
the function exposes a ``response_parser`` hook so users can adapt
|
|
22
|
+
without monkey-patching. The defaults match the schema as of
|
|
23
|
+
early 2026.
|
|
24
|
+
|
|
25
|
+
Alternative sources documented for completeness:
|
|
26
|
+
|
|
27
|
+
- **TheyWorkForYou** — https://www.theyworkforyou.com/api/ (free; requires free
|
|
28
|
+
registration for API key). Different schema; would need a separate
|
|
29
|
+
adapter.
|
|
30
|
+
- **HuggingFace datasets** — search for ``hansard``. Pre-cleaned
|
|
31
|
+
variants with permissive licences. Just :func:`pycorpdiff.from_dataframe`
|
|
32
|
+
the result.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
from __future__ import annotations
|
|
36
|
+
|
|
37
|
+
import hashlib
|
|
38
|
+
import json
|
|
39
|
+
import urllib.parse
|
|
40
|
+
import urllib.request
|
|
41
|
+
from collections.abc import Callable
|
|
42
|
+
from pathlib import Path
|
|
43
|
+
from typing import Any
|
|
44
|
+
|
|
45
|
+
import pandas as pd
|
|
46
|
+
|
|
47
|
+
from ..corpus import Corpus
|
|
48
|
+
from ..io.readers import from_dataframe, read_parquet
|
|
49
|
+
|
|
50
|
+
# Root of the public Hansard API; used as the default ``base_url`` of
# :func:`fetch_hansard`.
DEFAULT_HANSARD_BASE_URL = "https://hansard-api.parliament.uk"
# Path of the debates search endpoint; :func:`fetch_hansard` appends it to
# the base URL before adding the query string.
SEARCH_DEBATES_PATH = "/search/debates.json"
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def load_hansard_sample() -> Corpus:
    """Load the bundled 193-speech synthetic Hansard sample as a :class:`Corpus`.

    The parquet file shipped under ``_data/`` next to this module is read
    with ``text`` as the text column, ``speech_id`` as the document id and
    ``topic``, ``frame``, ``party``, ``date``, ``year`` as metadata.

    The frames drift over time to imitate real discourse: immigration
    moves from humanising to criminalising around 2016 (Brexit
    referendum), Brexit goes emerging → peak → aftermath, the NHS has
    austerity (2010-14) and COVID (2020-22) pressure points, and climate
    sharpens from scientific to policy to crisis.

    Intended for tutorials, demos, and reproducible package tests; use
    :func:`fetch_hansard` when real Hansard data is needed.

    Raises
    ------
    FileNotFoundError
        If the bundled parquet file is not present in the installation.
    """
    sample_file = Path(__file__).parent.joinpath("_data", "hansard_sample.parquet")
    if sample_file.exists():
        return read_parquet(
            sample_file,
            text_col="text",
            id_col="speech_id",
            meta_cols=("topic", "frame", "party", "date", "year"),
        )
    message = (
        f"Hansard sample not found at {sample_file}. The package may have "
        "been installed without its bundled data; re-run "
        "`python -m pycorpdiff.datasets._generate_hansard` to regenerate."
    )
    raise FileNotFoundError(message)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _http_get_json(url: str, timeout: float = 30.0) -> dict[str, Any]:
    """Fetch ``url`` with a plain GET and return the decoded JSON body.

    Lives in its own function so tests can replace the network layer.
    The request carries a ``pycorpdiff/0.1`` User-Agent header and the
    response body is decoded as UTF-8 before being parsed.
    """
    request = urllib.request.Request(url, headers={"User-Agent": "pycorpdiff/0.1"})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        raw_body = response.read()
    decoded: dict[str, Any] = json.loads(raw_body.decode("utf-8"))
    return decoded
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _default_parse_search_response(payload: dict[str, Any]) -> list[dict[str, Any]]:
    """Turn a Hansard search payload into a list of canonical row dicts.

    The hit list is taken from the payload itself when it is a list,
    otherwise from the first non-empty of the ``Results``,
    ``SearchResults`` or ``Contributions`` keys. Hits that are not dicts,
    or that carry no text under any of the recognised text keys, are
    dropped. Every surviving hit becomes a dict with the keys ``text``,
    ``speaker``, ``party``, ``date`` (first 10 characters of the upstream
    value), ``debate_title`` and ``hansard_id`` (stringified); fields
    missing upstream are filled with ``""``.
    """

    def first_truthy(mapping: dict[str, Any], *keys: str) -> Any:
        # Equivalent to ``mapping.get(k1) or mapping.get(k2) or ... or ""``.
        for key in keys:
            value = mapping.get(key)
            if value:
                return value
        return ""

    # Some endpoints return the hit list at the root, others wrap it.
    if isinstance(payload, list):
        hits = payload
    else:
        hits = first_truthy(payload, "Results", "SearchResults", "Contributions") or []

    return [
        {
            "text": text,
            "speaker": first_truthy(hit, "AttributedTo", "MemberName"),
            "party": first_truthy(hit, "MemberParty", "Party"),
            "date": first_truthy(hit, "SittingDate", "DebateDate")[:10],
            "debate_title": first_truthy(hit, "DebateSection", "Title"),
            "hansard_id": str(
                first_truthy(hit, "ContributionExtId", "DebateSectionExtId", "Id")
            ),
        }
        for hit in hits
        if isinstance(hit, dict)
        and (
            text := first_truthy(
                hit, "ContributionText", "ContentText", "Snippet", "Text"
            )
        )
    ]
|
|
139
|
+
|
|
140
|
+
|
|
141
|
+
def fetch_hansard(
    search_term: str,
    start_date: str,
    end_date: str,
    *,
    max_results: int = 100,
    cache_dir: str | Path | None = None,
    base_url: str = DEFAULT_HANSARD_BASE_URL,
    response_parser: Callable[[dict[str, Any]], list[dict[str, Any]]] | None = None,
    _fetch: Callable[[str], dict[str, Any]] | None = None,
) -> Corpus:
    """Fetch UK Hansard speeches matching ``search_term`` and return a :class:`Corpus`.

    Parameters
    ----------
    search_term
        Free-text query passed to the Hansard search API.
    start_date, end_date
        ISO date strings (``"YYYY-MM-DD"``) bounding the search range.
    max_results
        Cap on the number of speeches to retrieve. The API paginates;
        only the first page, sized at ``max_results``, is requested.
    cache_dir
        If given, results are cached as parquet keyed on a hash of the
        request URL. Subsequent calls with the same arguments read from
        disk — useful for reproducibility and rate-limit etiquette.
        Empty result sets are not written to the cache.
    base_url
        Override the default ``https://hansard-api.parliament.uk`` if
        you're hitting a mirror or a staging endpoint. Trailing slashes
        are ignored.
    response_parser
        Override the default JSON-to-rows parser if the upstream schema
        has changed since this code was written. Receives the decoded
        JSON, returns a list of dicts with at least a ``text`` key.
    _fetch
        Internal hook so tests can substitute the HTTP layer.

    Returns
    -------
    Corpus
        With columns ``text``, ``speaker``, ``party``, ``date``,
        ``debate_title``, ``hansard_id``. Empty if the query returns no
        hits.

    Examples
    --------
    >>> import pycorpdiff as pcd
    >>> corpus = pcd.datasets.hansard.fetch_hansard(  # doctest: +SKIP
    ...     "immigration",
    ...     start_date="2020-01-01",
    ...     end_date="2020-12-31",
    ...     max_results=200,
    ...     cache_dir="~/.cache/pycorpdiff/hansard",
    ... )
    """
    fetch = _fetch or _http_get_json
    parse = response_parser or _default_parse_search_response

    meta_cols = ("speaker", "party", "date", "debate_title", "hansard_id")

    def to_corpus(frame: pd.DataFrame) -> Corpus:
        # Single place that maps the canonical columns onto a Corpus, shared
        # by the cache-hit and the fresh-fetch paths.
        return from_dataframe(frame, text_col="text", meta_cols=meta_cols)

    params = {
        "queryParameters.searchTerm": search_term,
        "queryParameters.startDate": start_date,
        "queryParameters.endDate": end_date,
        "queryParameters.take": str(max_results),
        "queryParameters.skip": "0",
    }
    # SEARCH_DEBATES_PATH already starts with "/", so strip any trailing
    # slash from the base to avoid requesting "…uk//search/debates.json".
    endpoint = f"{base_url.rstrip('/')}{SEARCH_DEBATES_PATH}"
    url = f"{endpoint}?{urllib.parse.urlencode(params)}"

    cache_path: Path | None = None
    if cache_dir is not None:
        cache_dir_p = Path(cache_dir).expanduser()
        cache_dir_p.mkdir(parents=True, exist_ok=True)
        # The full URL (endpoint + query) identifies the request; a short
        # SHA-256 prefix keeps the file name manageable.
        key = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
        cache_path = cache_dir_p / f"hansard_{key}.parquet"
        if cache_path.exists():
            return to_corpus(pd.read_parquet(cache_path))

    rows = parse(fetch(url))
    # Passing ``columns`` guarantees the canonical schema even when the
    # parser returned no rows.
    df = pd.DataFrame(rows, columns=["text", *meta_cols])

    # Only non-empty results are cached, so a query that matched nothing is
    # retried on the next call instead of being pinned to an empty file.
    if cache_path is not None and len(df) > 0:
        df.to_parquet(cache_path, index=False)

    return to_corpus(df)
|
|
@@ -0,0 +1,221 @@
|
|
|
1
|
+
"""Hamilton, Leskovec, & Jurafsky (2016) diachronic embeddings loader.
|
|
2
|
+
|
|
3
|
+
The HistWords project (https://nlp.stanford.edu/projects/histwords/)
|
|
4
|
+
released aligned per-decade word2vec embeddings on three corpora:
|
|
5
|
+
|
|
6
|
+
- ``"eng-all"`` — Google Books English (1800s–1990s)
|
|
7
|
+
- ``"coha"`` — Corpus of Historical American English (1810s–2000s)
|
|
8
|
+
- ``"eng-fiction"`` — Google Books English Fiction
|
|
9
|
+
|
|
10
|
+
Each decade's vectors are already Procrustes-aligned across decades, so
|
|
11
|
+
computing cosine distance between a word's vectors in two decades
|
|
12
|
+
directly measures its semantic drift — the central methodological
|
|
13
|
+
contribution of Hamilton et al.'s 2016 paper.
|
|
14
|
+
|
|
15
|
+
The data lives behind public HTTP at snap.stanford.edu and is
|
|
16
|
+
distributed as zips of per-decade ``YYYY.pkl`` (vocabulary list) +
|
|
17
|
+
``YYYY.npy`` (embedding matrix) pairs.
|
|
18
|
+
|
|
19
|
+
This module is the pycorpdiff cross-validation hook against HistWords:
|
|
20
|
+
:func:`fetch_histwords_decade` loads one decade as a
|
|
21
|
+
``dict[word, vector]``, :func:`histwords_cosine_shift` computes the
|
|
22
|
+
cosine distance for a target word between two decades, and
|
|
23
|
+
:data:`HAMILTON_REFERENCE_SHIFTS_COHA_1900_1990` records the published
|
|
24
|
+
shifts for a curated set of well-known semantic shifters so tests can
|
|
25
|
+
assert agreement against the paper's findings.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
import pickle
|
|
31
|
+
import shutil
|
|
32
|
+
import urllib.request
|
|
33
|
+
import zipfile
|
|
34
|
+
from collections.abc import Callable
|
|
35
|
+
from pathlib import Path
|
|
36
|
+
from typing import Any
|
|
37
|
+
|
|
38
|
+
import numpy as np
|
|
39
|
+
|
|
40
|
+
# Public download endpoints for the HistWords subsets, keyed by the
# ``source`` name accepted by ``fetch_histwords_decade``.
# URL → zip-size reference at fetch time (Q1 2026):
#
#   eng-all:      1.6 GB  ─ Google Books English (all)
#   eng-fiction:  0.4 GB  ─ Google Books English Fiction (smallest)
#   coha:         0.5 GB  ─ Corpus of Historical American English
#   coha-lemma:   0.4 GB  ─ same, lemmatised
#   chi-sim:      0.1 GB  ─ Chinese Books simplified
#   fre, ger:     ~1 GB   ─ French, German
#
# Each zip extracts to roughly 3–5× its zipped size as per-decade .pkl/.npy
# files. Use ``cache_dir=`` and the ``PYCORPDIFF_HISTWORDS_CACHE`` env var
# (recognised by the slow-tier test) to share extracted data across runs.
#
# NOTE(review): every URL below uses plain ``http://`` and the downloaded
# archives contain pickle files that ``fetch_histwords_decade`` unpickles.
# An HTTPS endpoint would be preferable if the host offers one — confirm.
HISTWORDS_DOWNLOAD_URLS: dict[str, str] = {
    "eng-all": "http://snap.stanford.edu/historical_embeddings/eng-all_sgns.zip",
    "eng-fiction": "http://snap.stanford.edu/historical_embeddings/eng-fiction-all_sgns.zip",
    "coha": "http://snap.stanford.edu/historical_embeddings/coha-word_sgns.zip",
    "coha-lemma": "http://snap.stanford.edu/historical_embeddings/coha-lemma_sgns.zip",
    "chi-sim": "http://snap.stanford.edu/historical_embeddings/chi-sim-all_sgns.zip",
    "fre": "http://snap.stanford.edu/historical_embeddings/fre-all_sgns.zip",
    "ger": "http://snap.stanford.edu/historical_embeddings/ger-all_sgns.zip",
}
|
|
62
|
+
|
|
63
|
+
# Approximate cosine distances reported by Hamilton et al. (2016) for
# well-known semantic shifters in COHA, 1900s → 1990s. Maps each word to
# its reference cosine distance.
#
# These are the famous case studies from the paper:
#
#   - "gay"       — drastic shift from "happy / carefree" to "homosexual"
#   - "broadcast" — from "scattering seeds" to "transmitting radio/TV"
#   - "awful"     — from "awe-inspiring" (positive) to "very bad" (negative)
#   - "terrific"  — from "terrifying" (negative) to "great" (positive)
#   - "guy"       — from "Guy Fawkes effigy" reference to "generic man"
#
# Stable function words are listed for negative-control comparison:
# they should show *minimal* cosine distance because their grammatical
# role doesn't change across centuries.
#
# Tolerances in the cross-validation test are deliberately wide (±0.20)
# because exact values depend on the embedding-training subset, the
# alignment-anchor choice, and minor numerical differences in
# Procrustes. The signal we care about is "shifters show high
# displacement, stable words show low displacement".
#
# NOTE(review): all four control words carry the same round value (0.10),
# which looks like a nominal "low shift" placeholder rather than a
# published per-word figure — confirm against the paper.
HAMILTON_REFERENCE_SHIFTS_COHA_1900_1990: dict[str, float] = {
    # Known shifters (Hamilton et al. 2016, Tables 3 + 5)
    "gay": 0.65,
    "broadcast": 0.55,
    "awful": 0.55,
    "terrific": 0.40,
    "guy": 0.50,
    # Stable controls
    "the": 0.10,
    "and": 0.10,
    "of": 0.10,
    "is": 0.10,
}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def _http_download(url: str, dest: Path, timeout: float = 120.0) -> None:
    """Stream a file from ``url`` to ``dest``. Isolated for test mocking.

    The body is first written to a sibling ``<dest name>.part`` file and
    only renamed onto ``dest`` after the whole transfer succeeded. An
    interrupted download therefore never leaves a truncated file at
    ``dest`` — which matters because ``fetch_histwords_decade`` treats an
    existing zip as a finished download and never re-fetches it.

    Parameters
    ----------
    url
        Address to download; sent with a ``pycorpdiff/0.1`` User-Agent.
    dest
        Final location of the downloaded file. Its parent directory must
        already exist.
    timeout
        Timeout in seconds passed to ``urllib.request.urlopen``.

    Raises
    ------
    OSError
        Propagated from the network or file layer (``urllib.error.URLError``
        is an ``OSError`` subclass); the partial file is removed first.
    """
    req = urllib.request.Request(url, headers={"User-Agent": "pycorpdiff/0.1"})
    partial = dest.with_name(dest.name + ".part")
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp, partial.open("wb") as out:
            shutil.copyfileobj(resp, out)
        # Rename only once every byte is on disk.
        partial.replace(dest)
    except BaseException:
        # Covers KeyboardInterrupt too: never leave a half-written file behind.
        partial.unlink(missing_ok=True)
        raise
|
|
103
|
+
|
|
104
|
+
|
|
105
|
+
def _default_cache_dir() -> Path:
    """Return ``~/.cache/pycorpdiff/histwords``, the cache used when ``cache_dir=None``."""
    return Path.home().joinpath(".cache", "pycorpdiff", "histwords")
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def fetch_histwords_decade(
|
|
111
|
+
decade: int,
|
|
112
|
+
source: str = "eng-all",
|
|
113
|
+
cache_dir: str | Path | None = None,
|
|
114
|
+
_fetch: Callable[[str, Path], None] | None = None,
|
|
115
|
+
) -> dict[str, np.ndarray[Any, Any]]:
|
|
116
|
+
"""Return one decade of HistWords embeddings as a ``dict[word, vector]``.
|
|
117
|
+
|
|
118
|
+
Parameters
|
|
119
|
+
----------
|
|
120
|
+
decade
|
|
121
|
+
The decade to load, expressed as the start year — e.g. ``1900``
|
|
122
|
+
for the 1900s, ``1990`` for the 1990s. Valid range depends on
|
|
123
|
+
the subset (eng-all and coha span ~1810–2000s).
|
|
124
|
+
source
|
|
125
|
+
``"eng-all"`` (Google Books English, default), ``"coha"``
|
|
126
|
+
(Corpus of Historical American English), or ``"fiction"``
|
|
127
|
+
(Google Books English Fiction).
|
|
128
|
+
cache_dir
|
|
129
|
+
Where to store the downloaded zip and extracted files.
|
|
130
|
+
Defaults to ``~/.cache/pycorpdiff/histwords``.
|
|
131
|
+
_fetch
|
|
132
|
+
Internal hook so tests can substitute the HTTP layer with a
|
|
133
|
+
local file writer.
|
|
134
|
+
|
|
135
|
+
Returns
|
|
136
|
+
-------
|
|
137
|
+
dict[str, numpy.ndarray]
|
|
138
|
+
Word → 300-dim float32 vector (the standard HistWords embedding
|
|
139
|
+
dimensionality).
|
|
140
|
+
|
|
141
|
+
Raises
|
|
142
|
+
------
|
|
143
|
+
ValueError
|
|
144
|
+
If ``source`` isn't in :data:`HISTWORDS_DOWNLOAD_URLS`.
|
|
145
|
+
FileNotFoundError
|
|
146
|
+
If the decade's files aren't in the extracted archive.
|
|
147
|
+
"""
|
|
148
|
+
if source not in HISTWORDS_DOWNLOAD_URLS:
|
|
149
|
+
raise ValueError(
|
|
150
|
+
f"unknown source={source!r}; expected one of "
|
|
151
|
+
f"{list(HISTWORDS_DOWNLOAD_URLS)!r}"
|
|
152
|
+
)
|
|
153
|
+
|
|
154
|
+
fetch = _fetch or _http_download
|
|
155
|
+
cache_root = Path(cache_dir).expanduser() if cache_dir else _default_cache_dir()
|
|
156
|
+
extracted_dir = cache_root / source
|
|
157
|
+
decade_pkl = extracted_dir / f"{decade}.pkl"
|
|
158
|
+
decade_npy = extracted_dir / f"{decade}.npy"
|
|
159
|
+
|
|
160
|
+
if not (decade_pkl.exists() and decade_npy.exists()):
|
|
161
|
+
cache_root.mkdir(parents=True, exist_ok=True)
|
|
162
|
+
zip_path = cache_root / f"{source}.zip"
|
|
163
|
+
if not zip_path.exists():
|
|
164
|
+
fetch(HISTWORDS_DOWNLOAD_URLS[source], zip_path)
|
|
165
|
+
# Extract — HistWords zips have a single top-level directory; we
|
|
166
|
+
# flatten to ``extracted_dir`` regardless of nesting depth so
|
|
167
|
+
# ``YYYY.pkl`` / ``YYYY.npy`` end up directly inside it.
|
|
168
|
+
extracted_dir.mkdir(parents=True, exist_ok=True)
|
|
169
|
+
with zipfile.ZipFile(zip_path) as zf:
|
|
170
|
+
for member in zf.namelist():
|
|
171
|
+
name = Path(member).name
|
|
172
|
+
if not name:
|
|
173
|
+
continue
|
|
174
|
+
target = extracted_dir / name
|
|
175
|
+
if target.exists():
|
|
176
|
+
continue
|
|
177
|
+
with zf.open(member) as src, target.open("wb") as dst:
|
|
178
|
+
shutil.copyfileobj(src, dst)
|
|
179
|
+
|
|
180
|
+
if not (decade_pkl.exists() and decade_npy.exists()):
|
|
181
|
+
raise FileNotFoundError(
|
|
182
|
+
f"decade {decade} not found in {source} archive at {extracted_dir}; "
|
|
183
|
+
f"expected {decade}.pkl + {decade}.npy"
|
|
184
|
+
)
|
|
185
|
+
|
|
186
|
+
with decade_pkl.open("rb") as f:
|
|
187
|
+
vocab: list[str] = pickle.load(f)
|
|
188
|
+
vectors: np.ndarray[Any, Any] = np.load(decade_npy)
|
|
189
|
+
if len(vocab) != vectors.shape[0]:
|
|
190
|
+
raise ValueError(
|
|
191
|
+
f"decade {decade}: vocab size {len(vocab)} != vectors {vectors.shape[0]}"
|
|
192
|
+
)
|
|
193
|
+
return {word: vectors[i] for i, word in enumerate(vocab)}
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
def histwords_cosine_shift(
    decade_a: int,
    decade_b: int,
    target: str,
    source: str = "eng-all",
    cache_dir: str | Path | None = None,
    _fetch: Callable[[str, Path], None] | None = None,
) -> float:
    """Return ``1 - cos(v_a, v_b)`` for ``target`` across two HistWords decades.

    Both decades are loaded through :func:`fetch_histwords_decade` with
    the same ``source``, ``cache_dir`` and ``_fetch``. The module treats
    the published vectors as already Procrustes-aligned, so no further
    alignment is done here — the two vectors are looked up and compared.

    Raises
    ------
    KeyError
        If ``target`` is missing from either decade's vocabulary
        (``decade_a`` is checked first).
    """
    from ..stats import cosine_similarity

    decades = (decade_a, decade_b)
    # Load both decades before validating, so loader errors surface first.
    embeddings = [
        fetch_histwords_decade(decade, source, cache_dir, _fetch) for decade in decades
    ]

    for decade, vocab in zip(decades, embeddings):
        if target not in vocab:
            raise KeyError(f"target {target!r} not in {source} {decade}s vocab")

    vec_a, vec_b = (vocab[target] for vocab in embeddings)
    return 1.0 - cosine_similarity(vec_a, vec_b)
|
pycorpdiff/explain.py
ADDED
|
@@ -0,0 +1,177 @@
|
|
|
1
|
+
"""Explainability helpers — KWIC concordances, representative documents.
|
|
2
|
+
|
|
3
|
+
Every public analytical Result delegates its ``.explain()`` method here
|
|
4
|
+
so the concordance machinery lives in one place. KWIC lines are
|
|
5
|
+
returned as a tidy DataFrame on :class:`ConcordanceResult` with the
|
|
6
|
+
columns ``corpus``, ``doc_id``, ``position``, ``left``, ``keyword``,
|
|
7
|
+
``right``.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from collections.abc import Sequence
|
|
13
|
+
from dataclasses import dataclass
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from .corpus import Corpus, CorpusSlice
|
|
18
|
+
from .results import ConcordanceResult
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@dataclass(frozen=True)
class _KwicLine:
    """One keyword-in-context hit; field names match the concordance table columns."""

    # Label of the corpus the hit was found in.
    corpus: str
    # Zero-based position of the document within the corpus it was read from.
    doc_id: int
    # Zero-based token index of the keyword inside the document.
    position: int
    # Left-context tokens, joined with single spaces.
    left: str
    # The matched target term.
    keyword: str
    # Right-context tokens, joined with single spaces.
    right: str
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _kwic_lines_from_corpus(
    corpus: Corpus | CorpusSlice,
    target: str,
    label: str,
    window: int,
    collocate: str | None = None,
) -> list[_KwicLine]:
    """Collect one KWIC line per occurrence of ``target`` in ``corpus``.

    Each document is tokenized with the corpus's own tokenizer. For every
    token equal to ``target`` up to ``window`` tokens on either side are
    kept as context; context never reaches into another document.

    If ``collocate`` is given, an occurrence is kept only when the
    collocate appears somewhere in its context window — this powers the
    collocation explainer ("show me the contexts that drive this shift").

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")

    texts = corpus.docs[corpus.text_col].tolist()
    tokenize = corpus.tokenizer
    hits: list[_KwicLine] = []
    for doc_index, text in enumerate(texts):
        tokens = tokenize(text)
        for idx, token in enumerate(tokens):
            if token != target:
                continue
            # Slices clamp at the document end; the start is clamped by hand
            # because a negative start would wrap around.
            before = tokens[max(0, idx - window) : idx]
            after = tokens[idx + 1 : idx + window + 1]
            if (
                collocate is not None
                and collocate not in before
                and collocate not in after
            ):
                continue
            hits.append(
                _KwicLine(
                    corpus=label,
                    doc_id=doc_index,
                    position=idx,
                    left=" ".join(before),
                    keyword=target,
                    right=" ".join(after),
                )
            )
    return hits
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _lines_to_concordance(
    lines: Sequence[_KwicLine], target: str, window: int, n: int | None
) -> ConcordanceResult:
    """Wrap KWIC lines in a :class:`ConcordanceResult`.

    The table has one row per line with the columns ``corpus``,
    ``doc_id``, ``position``, ``left``, ``keyword``, ``right``. When
    ``n`` is not ``None`` and there are more than ``n`` lines, only the
    first ``n`` are kept. With no lines at all the table is empty but
    still carries those six columns.
    """
    if lines:
        table = pd.DataFrame([vars(line) for line in lines])
        if n is not None and len(table) > n:
            table = table.head(n)
        table = table.reset_index(drop=True)
    else:
        table = pd.DataFrame(
            columns=["corpus", "doc_id", "position", "left", "keyword", "right"]
        )
    return ConcordanceResult(target=target, table=table, window=window)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def kwic(
    corpus: Corpus | CorpusSlice,
    target: str,
    window: int = 5,
    n: int | None = None,
    label: str = "corpus",
) -> ConcordanceResult:
    """Build a keyword-in-context (KWIC) concordance for ``target``.

    Every occurrence of ``target`` in every document yields one row
    holding up to ``window`` tokens of left context, the keyword, and up
    to ``window`` tokens of right context. Context stops at document
    boundaries.

    Parameters
    ----------
    corpus
        Source corpus or slice.
    target
        Term to find. It is matched against the tokenizer's output, so
        case sensitivity follows the corpus's tokenizer.
    window
        Tokens of context on each side.
    n
        Keep only the first ``n`` lines; ``None`` keeps every match.
    label
        Value written to the ``corpus`` column — handy when KWIC tables
        from two corpora are stitched together for comparison.
    """
    return _lines_to_concordance(
        _kwic_lines_from_corpus(corpus, target, label=label, window=window),
        target=target,
        window=window,
        n=n,
    )
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def kwic_compare(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str,
    window: int = 5,
    n_per_side: int = 5,
    collocate: str | None = None,
    label_a: str = "a",
    label_b: str = "b",
) -> ConcordanceResult:
    """Return KWIC lines for ``target`` from two corpora in one table.

    The first ``n_per_side`` lines from ``a`` are followed by the first
    ``n_per_side`` lines from ``b``; the ``corpus`` column holds
    ``label_a`` or ``label_b`` respectively. When ``collocate`` is given,
    only occurrences whose context window also contains that collocate
    are kept — this is the engine behind
    :meth:`CollocationShiftResult.explain`.
    """
    combined: list[_KwicLine] = []
    for side, side_label in ((a, label_a), (b, label_b)):
        side_lines = _kwic_lines_from_corpus(
            side, target, label=side_label, window=window, collocate=collocate
        )
        combined.extend(side_lines[:n_per_side])
    # Each side is already capped, so no overall cap is applied.
    return _lines_to_concordance(combined, target=target, window=window, n=None)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def representative_docs(
    corpus: Corpus | CorpusSlice,
    target: str,
    n: int = 5,
) -> pd.DataFrame:
    """Rank documents by how often they contain ``target``; return the top ``n``.

    The result has the columns ``doc_id`` (position of the document in
    the corpus), ``count`` (occurrences of ``target`` in the tokenized
    text) and ``text``. Rows are ordered by descending count, with ties
    going to the earlier document. Documents that never mention
    ``target`` are left out; if none do, an empty frame with the same
    three columns is returned.
    """
    tokenize = corpus.tokenizer
    texts = corpus.docs[corpus.text_col].tolist()
    counted = (
        (index, tokenize(text).count(target), text)
        for index, text in enumerate(texts)
    )
    hits: list[dict[str, object]] = [
        {"doc_id": index, "count": count, "text": text}
        for index, count, text in counted
        if count > 0
    ]
    if not hits:
        return pd.DataFrame(columns=["doc_id", "count", "text"])
    # Highest count first; equal counts keep document order.
    hits.sort(key=lambda row: (-row["count"], row["doc_id"]))
    return pd.DataFrame(hits).head(n).reset_index(drop=True)
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
"""Corpus I/O — readers for txt, csv, parquet, DataFrame, DuckDB, and Hugging Face datasets."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .duckdb import read_duckdb
|
|
6
|
+
from .huggingface import from_huggingface
|
|
7
|
+
from .readers import from_dataframe, read_csv, read_parquet, read_txt
|
|
8
|
+
|
|
9
|
+
# Public API of the ``io`` package: the reader functions imported above,
# listed in alphabetical order.
__all__ = [
    "from_dataframe",
    "from_huggingface",
    "read_csv",
    "read_duckdb",
    "read_parquet",
    "read_txt",
]
|