pycorpdiff 0.1.0a0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. pycorpdiff/__init__.py +126 -0
  2. pycorpdiff/_backends/__init__.py +3 -0
  3. pycorpdiff/_backends/pandas.py +3 -0
  4. pycorpdiff/_backends/polars.py +3 -0
  5. pycorpdiff/collocation/__init__.py +19 -0
  6. pycorpdiff/collocation/cooccurrence.py +65 -0
  7. pycorpdiff/collocation/measures.py +102 -0
  8. pycorpdiff/collocation/network.py +233 -0
  9. pycorpdiff/collocation/shift.py +146 -0
  10. pycorpdiff/compare.py +345 -0
  11. pycorpdiff/corpus.py +411 -0
  12. pycorpdiff/datasets/__init__.py +27 -0
  13. pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  14. pycorpdiff/datasets/_generate_hansard.py +221 -0
  15. pycorpdiff/datasets/hansard.py +235 -0
  16. pycorpdiff/datasets/histwords.py +221 -0
  17. pycorpdiff/explain.py +177 -0
  18. pycorpdiff/io/__init__.py +16 -0
  19. pycorpdiff/io/duckdb.py +92 -0
  20. pycorpdiff/io/huggingface.py +142 -0
  21. pycorpdiff/io/readers.py +138 -0
  22. pycorpdiff/keyness/__init__.py +26 -0
  23. pycorpdiff/keyness/bayes.py +50 -0
  24. pycorpdiff/keyness/chi_squared.py +94 -0
  25. pycorpdiff/keyness/correction.py +34 -0
  26. pycorpdiff/keyness/dispersion.py +89 -0
  27. pycorpdiff/keyness/effect_sizes.py +65 -0
  28. pycorpdiff/keyness/loglikelihood.py +92 -0
  29. pycorpdiff/keyness/multicorpus.py +143 -0
  30. pycorpdiff/keyness/permutation.py +154 -0
  31. pycorpdiff/py.typed +0 -0
  32. pycorpdiff/results.py +635 -0
  33. pycorpdiff/semantic/__init__.py +18 -0
  34. pycorpdiff/semantic/alignment.py +53 -0
  35. pycorpdiff/semantic/embed.py +84 -0
  36. pycorpdiff/semantic/shift.py +224 -0
  37. pycorpdiff/semantic/trajectory.py +166 -0
  38. pycorpdiff/stats.py +69 -0
  39. pycorpdiff/temporal/__init__.py +15 -0
  40. pycorpdiff/temporal/bocpd.py +233 -0
  41. pycorpdiff/temporal/causal_impact.py +293 -0
  42. pycorpdiff/temporal/changepoint.py +92 -0
  43. pycorpdiff/temporal/forecast.py +405 -0
  44. pycorpdiff/temporal/its.py +123 -0
  45. pycorpdiff/temporal/slicing.py +174 -0
  46. pycorpdiff/tokenize.py +110 -0
  47. pycorpdiff/viz/__init__.py +37 -0
  48. pycorpdiff/viz/bocpd.py +173 -0
  49. pycorpdiff/viz/causal_impact.py +142 -0
  50. pycorpdiff/viz/collocation.py +48 -0
  51. pycorpdiff/viz/dispersion.py +117 -0
  52. pycorpdiff/viz/forecast.py +129 -0
  53. pycorpdiff/viz/keyness.py +96 -0
  54. pycorpdiff/viz/network.py +186 -0
  55. pycorpdiff/viz/scattertext.py +160 -0
  56. pycorpdiff/viz/semantic_forecast.py +114 -0
  57. pycorpdiff/viz/trajectory.py +48 -0
  58. pycorpdiff-0.1.0a0.dist-info/METADATA +230 -0
  59. pycorpdiff-0.1.0a0.dist-info/RECORD +61 -0
  60. pycorpdiff-0.1.0a0.dist-info/WHEEL +4 -0
  61. pycorpdiff-0.1.0a0.dist-info/licenses/LICENSE +21 -0
@@ -0,0 +1,235 @@
1
+ """UK Hansard loader: bundled synthetic sample + live fetcher.
2
+
3
+ Two functions live here:
4
+
5
+ - :func:`load_hansard_sample` — return the bundled 193-speech synthetic
6
+ sample. Deterministic; ships with the package; no network. Use this
7
+ for tutorials, tests, and offline demos.
8
+ - :func:`fetch_hansard` — query the live UK Parliament Hansard search
9
+ API, optionally caching to a local parquet, and return the matched
10
+ speeches as a :class:`Corpus`. Use this for actual research.
11
+
12
+ The live API
13
+ ------------
14
+
15
+ ``fetch_hansard`` hits the public Hansard search endpoint at
16
+ ``https://hansard-api.parliament.uk/``. The endpoint requires no auth
17
+ and serves UK parliamentary speeches under the Open Government
18
+ Licence (essentially public domain, attribution requested).
19
+
20
+ The API surface changes occasionally; if a field name changes upstream
21
+ the function exposes a ``response_parser`` hook so users can adapt
22
+ without monkey-patching. The defaults match the schema as of
23
+ early 2026.
24
+
25
+ Alternative sources documented for completeness:
26
+
27
+ - **TheyWorkForYou** — https://www.theyworkforyou.com/api/ (free, but
28
+ requires registering for an API key). Different schema; would need a separate
29
+ adapter.
30
+ - **HuggingFace datasets** — search for ``hansard``. Pre-cleaned
31
+ variants with permissive licences. Just :func:`pycorpdiff.from_dataframe`
32
+ the result.
33
+ """
34
+
35
+ from __future__ import annotations
36
+
37
+ import hashlib
38
+ import json
39
+ import urllib.parse
40
+ import urllib.request
41
+ from collections.abc import Callable
42
+ from pathlib import Path
43
+ from typing import Any
44
+
45
+ import pandas as pd
46
+
47
+ from ..corpus import Corpus
48
+ from ..io.readers import from_dataframe, read_parquet
49
+
50
# Root of the public Hansard API; used as the default ``base_url`` of
# :func:`fetch_hansard`.
DEFAULT_HANSARD_BASE_URL = "https://hansard-api.parliament.uk"
# Path of the debates search endpoint; appended to the base URL when the
# request URL is built.
SEARCH_DEBATES_PATH = "/search/debates.json"
52
+
53
+
54
def load_hansard_sample() -> Corpus:
    """Load the synthetic 193-speech Hansard sample that ships with the package.

    The resulting :class:`Corpus` uses ``speech_id`` as the document id,
    ``text`` as the text column, and carries ``topic``, ``frame``,
    ``party``, ``date`` and ``year`` as metadata. The sample's frames are
    designed to drift over time the way real discourse does: immigration
    moves humanising → criminalising around 2016 (Brexit referendum),
    Brexit goes emerging → peak → aftermath, NHS has austerity (2010-14)
    and COVID (2020-22) pressure points, and climate sharpens
    scientific → policy → crisis.

    Intended for tutorials, demos, and reproducible tests; use
    :func:`fetch_hansard` when you need real Hansard data.

    Raises
    ------
    FileNotFoundError
        If the bundled parquet file is missing from the installation.
    """
    sample_file = Path(__file__).parent.joinpath("_data", "hansard_sample.parquet")
    if sample_file.exists():
        return read_parquet(
            sample_file,
            text_col="text",
            id_col="speech_id",
            meta_cols=("topic", "frame", "party", "date", "year"),
        )
    raise FileNotFoundError(
        f"Hansard sample not found at {sample_file}. The package may have "
        "been installed without its bundled data; re-run "
        "`python -m pycorpdiff.datasets._generate_hansard` to regenerate."
    )
80
+
81
+
82
+ def _http_get_json(url: str, timeout: float = 30.0) -> dict[str, Any]:
83
+ """Plain GET → JSON. Isolated so tests can monkey-patch it cleanly."""
84
+ req = urllib.request.Request(url, headers={"User-Agent": "pycorpdiff/0.1"})
85
+ with urllib.request.urlopen(req, timeout=timeout) as resp:
86
+ payload = resp.read().decode("utf-8")
87
+ result: dict[str, Any] = json.loads(payload)
88
+ return result
89
+
90
+
91
+ def _default_parse_search_response(payload: dict[str, Any]) -> list[dict[str, Any]]:
92
+ """Extract rows from a Hansard search-results payload.
93
+
94
+ The Hansard search endpoint returns a JSON object with a top-level
95
+ list of search hits. Field names vary slightly across endpoints and
96
+ over time; this parser tolerates the common variations
97
+ (``Results`` / ``SearchResults`` / list-at-root) and surfaces a
98
+ canonical set of fields.
99
+ """
100
+ # The response can be {"Results": [...]} or just [...] depending on endpoint.
101
+ if isinstance(payload, list):
102
+ hits = payload
103
+ else:
104
+ hits = (
105
+ payload.get("Results")
106
+ or payload.get("SearchResults")
107
+ or payload.get("Contributions")
108
+ or []
109
+ )
110
+ rows: list[dict[str, Any]] = []
111
+ for hit in hits:
112
+ if not isinstance(hit, dict):
113
+ continue
114
+ text = (
115
+ hit.get("ContributionText")
116
+ or hit.get("ContentText")
117
+ or hit.get("Snippet")
118
+ or hit.get("Text")
119
+ or ""
120
+ )
121
+ if not text:
122
+ continue
123
+ rows.append(
124
+ {
125
+ "text": text,
126
+ "speaker": hit.get("AttributedTo") or hit.get("MemberName") or "",
127
+ "party": hit.get("MemberParty") or hit.get("Party") or "",
128
+ "date": (hit.get("SittingDate") or hit.get("DebateDate") or "")[:10],
129
+ "debate_title": hit.get("DebateSection") or hit.get("Title") or "",
130
+ "hansard_id": str(
131
+ hit.get("ContributionExtId")
132
+ or hit.get("DebateSectionExtId")
133
+ or hit.get("Id")
134
+ or ""
135
+ ),
136
+ }
137
+ )
138
+ return rows
139
+
140
+
141
def fetch_hansard(
    search_term: str,
    start_date: str,
    end_date: str,
    *,
    max_results: int = 100,
    cache_dir: str | Path | None = None,
    base_url: str = DEFAULT_HANSARD_BASE_URL,
    response_parser: Callable[[dict[str, Any]], list[dict[str, Any]]] | None = None,
    _fetch: Callable[[str], dict[str, Any]] | None = None,
) -> Corpus:
    """Fetch UK Hansard speeches matching ``search_term`` and return a :class:`Corpus`.

    Parameters
    ----------
    search_term
        Free-text query passed to the Hansard search API.
    start_date, end_date
        ISO date strings (``"YYYY-MM-DD"``) bounding the search range.
    max_results
        Cap on the number of speeches to retrieve. The API paginates;
        only the first page, sized at ``max_results``, is requested.
    cache_dir
        If given, results are cached as parquet keyed on a hash of the
        request URL (``~`` is expanded). Subsequent calls with the same
        arguments read from disk — useful for reproducibility and
        rate-limit etiquette. Empty result sets are not cached.
    base_url
        Override the default ``https://hansard-api.parliament.uk`` if
        you're hitting a mirror or a staging endpoint. A trailing ``/``
        is ignored.
    response_parser
        Override the default JSON-to-rows parser if the upstream schema
        has changed since this code was written. Receives the decoded
        JSON, returns a list of dicts with at least a ``text`` key.
    _fetch
        Internal hook so tests can substitute the HTTP layer.

    Returns
    -------
    Corpus
        With columns ``text``, ``speaker``, ``party``, ``date``,
        ``debate_title``, ``hansard_id``. Empty if the query returns no
        hits.

    Examples
    --------
    >>> import pycorpdiff as pcd
    >>> corpus = pcd.datasets.hansard.fetch_hansard(  # doctest: +SKIP
    ...     "immigration",
    ...     start_date="2020-01-01",
    ...     end_date="2020-12-31",
    ...     max_results=200,
    ...     cache_dir="~/.cache/pycorpdiff/hansard",
    ... )
    """
    fetch = _fetch or _http_get_json
    parse = response_parser or _default_parse_search_response
    columns = ["text", "speaker", "party", "date", "debate_title", "hansard_id"]

    def as_corpus(frame: pd.DataFrame) -> Corpus:
        # Single conversion point shared by the cached and freshly fetched paths.
        return from_dataframe(frame, text_col="text", meta_cols=tuple(columns[1:]))

    query = urllib.parse.urlencode(
        {
            "queryParameters.searchTerm": search_term,
            "queryParameters.startDate": start_date,
            "queryParameters.endDate": end_date,
            "queryParameters.take": str(max_results),
            "queryParameters.skip": "0",
        }
    )
    # SEARCH_DEBATES_PATH already starts with "/"; strip any trailing slash on
    # the base so "https://host/" does not produce "https://host//search/...".
    url = f"{base_url.rstrip('/')}{SEARCH_DEBATES_PATH}?{query}"

    cache_path: Path | None = None
    if cache_dir is not None:
        cache_root = Path(cache_dir).expanduser()
        cache_root.mkdir(parents=True, exist_ok=True)
        key = hashlib.sha256(url.encode("utf-8")).hexdigest()[:16]
        cache_path = cache_root / f"hansard_{key}.parquet"
        if cache_path.exists():
            return as_corpus(pd.read_parquet(cache_path))

    rows = parse(fetch(url))
    df = pd.DataFrame(rows, columns=columns)

    # Only non-empty results are written, so an empty response is re-queried
    # on the next call instead of being pinned in the cache.
    if cache_path is not None and len(df) > 0:
        df.to_parquet(cache_path, index=False)

    return as_corpus(df)
@@ -0,0 +1,221 @@
1
+ """Hamilton, Leskovec, & Jurafsky (2016) diachronic embeddings loader.
2
+
3
+ The HistWords project (https://nlp.stanford.edu/projects/histwords/)
4
+ released aligned per-decade word2vec embeddings on three corpora:
5
+
6
+ - ``"eng-all"`` — Google Books English (1800s–1990s)
7
+ - ``"coha"`` — Corpus of Historical American English (1810s–2000s)
8
+ - ``"eng-fiction"`` — Google Books English Fiction
9
+
10
+ Each decade's vectors are already Procrustes-aligned across decades, so
11
+ computing cosine distance between a word's vectors in two decades
12
+ directly measures its semantic drift — the central methodological
13
+ contribution of Hamilton et al.'s 2016 paper.
14
+
15
+ The data lives behind public HTTP at snap.stanford.edu and is
16
+ distributed as zips of per-decade ``YYYY.pkl`` (vocabulary list) +
17
+ ``YYYY.npy`` (embedding matrix) pairs.
18
+
19
+ This module is the pycorpdiff cross-validation hook against HistWords:
20
+ :func:`fetch_histwords_decade` loads one decade as a
21
+ ``dict[word, vector]``, :func:`histwords_cosine_shift` computes the
22
+ cosine distance for a target word between two decades, and
23
+ :data:`HAMILTON_REFERENCE_SHIFTS_COHA_1900_1990` records the published
24
+ shifts for a curated set of well-known semantic shifters so tests can
25
+ assert agreement against the paper's findings.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ import pickle
31
+ import shutil
32
+ import urllib.request
33
+ import zipfile
34
+ from collections.abc import Callable
35
+ from pathlib import Path
36
+ from typing import Any
37
+
38
+ import numpy as np
39
+
40
+ # Public download endpoints for the three HistWords subsets.
41
+ # URL → zip-size reference at fetch time (Q1 2026):
42
+ #
43
+ # eng-all: 1.6 GB ─ Google Books English (all)
44
+ # eng-fiction: 0.4 GB ─ Google Books English Fiction (smallest)
45
+ # coha: 0.5 GB ─ Corpus of Historical American English
46
+ # coha-lemma: 0.4 GB ─ same, lemmatised
47
+ # chi-sim: 0.1 GB ─ Chinese Books simplified
48
+ # fre, ger: ~1 GB ─ French, German
49
+ #
50
+ # Each zip extracts to roughly 3–5× its zipped size as per-decade .pkl/.npy
51
+ # files. Use ``cache_dir=`` and the ``PYCORPDIFF_HISTWORDS_CACHE`` env var
52
+ # (recognised by the slow-tier test) to share extracted data across runs.
53
# Every archive lives under the same directory and follows the
# ``<stem>_sgns.zip`` naming pattern; only the stem differs per subset.
HISTWORDS_DOWNLOAD_URLS: dict[str, str] = {
    source: f"http://snap.stanford.edu/historical_embeddings/{stem}_sgns.zip"
    for source, stem in (
        ("eng-all", "eng-all"),
        ("eng-fiction", "eng-fiction-all"),
        ("coha", "coha-word"),
        ("coha-lemma", "coha-lemma"),
        ("chi-sim", "chi-sim-all"),
        ("fre", "fre-all"),
        ("ger", "ger-all"),
    )
}
62
+
63
+ # Approximate cosine distances reported by Hamilton et al. (2016) for
64
+ # well-known semantic shifters in COHA, 1900s → 1990s.
65
+ #
66
+ # These are the famous case studies from the paper:
67
+ #
68
+ # - "gay" — drastic shift from "happy / carefree" to "homosexual"
69
+ # - "broadcast" — from "scattering seeds" to "transmitting radio/TV"
70
+ # - "awful" — from "awe-inspiring" (positive) to "very bad" (negative)
71
+ # - "terrific" — from "terrifying" (negative) to "great" (positive)
72
+ # - "guy" — from "Guy Fawkes effigy" reference to "generic man"
73
+ #
74
+ # Stable function words are listed for negative-control comparison:
75
+ # they should show *minimal* cosine distance because their grammatical
76
+ # role doesn't change across centuries.
77
+ #
78
+ # Tolerances in the cross-validation test are deliberately wide (±0.20)
79
+ # because exact values depend on the embedding-training subset, the
80
+ # alignment-anchor choice, and minor numerical differences in
81
+ # Procrustes. The signal we care about is "shifters show high
82
+ # displacement, stable words show low displacement".
83
HAMILTON_REFERENCE_SHIFTS_COHA_1900_1990: dict[str, float] = {
    # Known shifters (Hamilton et al. 2016, Tables 3 + 5)
    "gay": 0.65,
    "broadcast": 0.55,
    "awful": 0.55,
    "terrific": 0.40,
    "guy": 0.50,
    # Stable controls: every function word shares the same low reference value.
    **dict.fromkeys(("the", "and", "of", "is"), 0.10),
}
96
+
97
+
98
+ def _http_download(url: str, dest: Path, timeout: float = 120.0) -> None:
99
+ """Stream a file from ``url`` to ``dest``. Isolated for test mocking."""
100
+ req = urllib.request.Request(url, headers={"User-Agent": "pycorpdiff/0.1"})
101
+ with urllib.request.urlopen(req, timeout=timeout) as resp, dest.open("wb") as out:
102
+ shutil.copyfileobj(resp, out)
103
+
104
+
105
+ def _default_cache_dir() -> Path:
106
+ """Where decade embeddings are cached when ``cache_dir=None``."""
107
+ return Path.home() / ".cache" / "pycorpdiff" / "histwords"
108
+
109
+
110
def fetch_histwords_decade(
    decade: int,
    source: str = "eng-all",
    cache_dir: str | Path | None = None,
    _fetch: Callable[[str, Path], None] | None = None,
) -> dict[str, np.ndarray[Any, Any]]:
    """Return one decade of HistWords embeddings as a ``dict[word, vector]``.

    On a cache miss the subset's zip is downloaded (once) into the cache
    root and every file in it is extracted, flattened by basename, into
    ``<cache_root>/<source>/``. Later calls for any decade of the same
    subset then read straight from disk.

    Parameters
    ----------
    decade
        The decade to load, expressed as the start year — e.g. ``1900``
        for the 1900s, ``1990`` for the 1990s. Valid range depends on
        the subset.
    source
        A key of :data:`HISTWORDS_DOWNLOAD_URLS`, e.g. ``"eng-all"``
        (default), ``"coha"`` or ``"eng-fiction"``.
    cache_dir
        Where to store the downloaded zip and extracted files (``~`` is
        expanded). Defaults to ``~/.cache/pycorpdiff/histwords``.
    _fetch
        Internal hook so tests can substitute the HTTP layer with a
        local file writer. Called as ``_fetch(url, destination_path)``.

    Returns
    -------
    dict[str, numpy.ndarray]
        Word → embedding vector, where the vector for the *i*-th vocabulary
        entry is row *i* of the decade's ``.npy`` matrix.

    Raises
    ------
    ValueError
        If ``source`` isn't in :data:`HISTWORDS_DOWNLOAD_URLS`, or the
        vocabulary length does not match the number of matrix rows.
    FileNotFoundError
        If the decade's files aren't in the extracted archive.
    """
    if source not in HISTWORDS_DOWNLOAD_URLS:
        raise ValueError(
            f"unknown source={source!r}; expected one of "
            f"{list(HISTWORDS_DOWNLOAD_URLS)!r}"
        )

    fetch = _fetch or _http_download
    cache_root = Path(cache_dir).expanduser() if cache_dir else _default_cache_dir()
    extracted_dir = cache_root / source
    # NOTE(review): upstream HistWords archives may name these files
    # ``<decade>-vocab.pkl`` / ``<decade>-w.npy`` — confirm against a real zip.
    decade_pkl = extracted_dir / f"{decade}.pkl"
    decade_npy = extracted_dir / f"{decade}.npy"

    if not (decade_pkl.exists() and decade_npy.exists()):
        cache_root.mkdir(parents=True, exist_ok=True)
        zip_path = cache_root / f"{source}.zip"
        if not zip_path.exists():
            # Download under a temporary name and rename on success, so an
            # interrupted transfer never leaves a truncated archive that
            # later calls would mistake for a complete one.
            partial_zip = zip_path.with_name(zip_path.name + ".part")
            fetch(HISTWORDS_DOWNLOAD_URLS[source], partial_zip)
            if partial_zip.exists():
                partial_zip.replace(zip_path)
        # Flatten to ``extracted_dir`` regardless of nesting depth so
        # ``YYYY.pkl`` / ``YYYY.npy`` end up directly inside it. Using only
        # the basename also keeps archive paths from escaping the directory.
        extracted_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(zip_path) as zf:
            for info in zf.infolist():
                # Directory entries must be skipped explicitly:
                # ``Path("dir/").name`` is "dir", not "", so a basename check
                # alone would write each directory out as an empty file.
                if info.is_dir():
                    continue
                name = Path(info.filename).name
                if not name:
                    continue
                target = extracted_dir / name
                if target.exists():
                    continue
                # Same write-then-rename pattern as the download, so a crash
                # mid-extraction cannot leave a truncated file behind.
                partial = target.with_name(name + ".part")
                with zf.open(info) as src, partial.open("wb") as dst:
                    shutil.copyfileobj(src, dst)
                partial.replace(target)

    if not (decade_pkl.exists() and decade_npy.exists()):
        raise FileNotFoundError(
            f"decade {decade} not found in {source} archive at {extracted_dir}; "
            f"expected {decade}.pkl + {decade}.npy"
        )

    # SECURITY: pickle.load can execute arbitrary code embedded in the file,
    # and the archive is fetched over plain HTTP (see HISTWORDS_DOWNLOAD_URLS).
    # Only use this with a network and cache directory you trust.
    with decade_pkl.open("rb") as f:
        vocab: list[str] = pickle.load(f)
    vectors: np.ndarray[Any, Any] = np.load(decade_npy)
    if len(vocab) != vectors.shape[0]:
        raise ValueError(
            f"decade {decade}: vocab size {len(vocab)} != vectors {vectors.shape[0]}"
        )
    return {word: vectors[i] for i, word in enumerate(vocab)}
194
+
195
+
196
def histwords_cosine_shift(
    decade_a: int,
    decade_b: int,
    target: str,
    source: str = "eng-all",
    cache_dir: str | Path | None = None,
    _fetch: Callable[[str, Path], None] | None = None,
) -> float:
    """Return ``1 - cos(v_a, v_b)`` for ``target`` across two HistWords decades.

    Both decades are loaded with :func:`fetch_histwords_decade` and the
    word's two vectors are compared directly; no alignment is performed
    here because the published vectors are already Procrustes-aligned.

    Raises
    ------
    KeyError
        If ``target`` is missing from either decade's vocabulary
        (``decade_a`` is checked first).
    """
    from ..stats import cosine_similarity

    embeddings_a = fetch_histwords_decade(decade_a, source, cache_dir, _fetch)
    embeddings_b = fetch_histwords_decade(decade_b, source, cache_dir, _fetch)

    for decade, embeddings in ((decade_a, embeddings_a), (decade_b, embeddings_b)):
        if target not in embeddings:
            raise KeyError(f"target {target!r} not in {source} {decade}s vocab")

    similarity = cosine_similarity(embeddings_a[target], embeddings_b[target])
    return 1.0 - similarity
pycorpdiff/explain.py ADDED
@@ -0,0 +1,177 @@
1
+ """Explainability helpers — KWIC concordances, representative documents.
2
+
3
+ Every public analytical Result delegates its ``.explain()`` method here
4
+ so the concordance machinery lives in one place. KWIC lines are
5
+ returned as a tidy DataFrame on :class:`ConcordanceResult` with the
6
+ columns ``corpus``, ``doc_id``, ``position``, ``left``, ``keyword``,
7
+ ``right``.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from collections.abc import Sequence
13
+ from dataclasses import dataclass
14
+
15
+ import pandas as pd
16
+
17
+ from .corpus import Corpus, CorpusSlice
18
+ from .results import ConcordanceResult
19
+
20
+
21
@dataclass(frozen=True)
class _KwicLine:
    """One keyword-in-context hit; its fields become the concordance table columns."""

    corpus: str  # label identifying which corpus the hit came from
    doc_id: int  # positional index of the document within that corpus
    position: int  # token index of the keyword inside the document
    left: str  # space-joined context tokens preceding the keyword
    keyword: str  # the matched target token
    right: str  # space-joined context tokens following the keyword
29
+
30
+
31
def _kwic_lines_from_corpus(
    corpus: Corpus | CorpusSlice,
    target: str,
    label: str,
    window: int,
    collocate: str | None = None,
) -> list[_KwicLine]:
    """Collect one :class:`_KwicLine` per occurrence of ``target`` in ``corpus``.

    Each document is tokenized with the corpus's own tokenizer, and context
    is clipped at the document's edges. If ``collocate`` is supplied, an
    occurrence is kept only when ``collocate`` appears somewhere in its
    left or right context — the filter behind the collocation explainer
    ("show me the contexts that drive this shift").

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError(f"window must be >= 1; got {window}")
    texts = corpus.docs[corpus.text_col].tolist()
    tokenize = corpus.tokenizer
    hits: list[_KwicLine] = []
    for doc_index, text in enumerate(texts):
        tokens = tokenize(text)
        total = len(tokens)
        for index, token in enumerate(tokens):
            if token != target:
                continue
            start = max(0, index - window)
            stop = min(total, index + window + 1)
            before = tokens[start:index]
            after = tokens[index + 1 : stop]
            # The keyword position itself never counts as a collocate match.
            if (
                collocate is not None
                and collocate not in before
                and collocate not in after
            ):
                continue
            hits.append(
                _KwicLine(
                    corpus=label,
                    doc_id=doc_index,
                    position=index,
                    left=" ".join(before),
                    keyword=target,
                    right=" ".join(after),
                )
            )
    return hits
74
+
75
+
76
def _lines_to_concordance(
    lines: Sequence[_KwicLine], target: str, window: int, n: int | None
) -> ConcordanceResult:
    """Wrap KWIC lines in a :class:`ConcordanceResult`, keeping at most ``n`` rows.

    With no lines the table is an empty frame that still has the six
    concordance columns. ``n=None`` keeps every line.
    """
    if lines:
        frame = pd.DataFrame([vars(entry) for entry in lines])
        if n is not None and len(frame) > n:
            frame = frame.head(n)
        frame = frame.reset_index(drop=True)
    else:
        frame = pd.DataFrame(
            columns=["corpus", "doc_id", "position", "left", "keyword", "right"]
        )
    return ConcordanceResult(target=target, table=frame, window=window)
90
+
91
+
92
def kwic(
    corpus: Corpus | CorpusSlice,
    target: str,
    window: int = 5,
    n: int | None = None,
    label: str = "corpus",
) -> ConcordanceResult:
    """Build a keyword-in-context concordance for ``target``.

    Every occurrence of ``target`` in every document yields one row holding
    up to ``window`` tokens of left context, the keyword, and up to
    ``window`` tokens of right context. Context is taken from within a
    single document only.

    Parameters
    ----------
    corpus
        Source corpus or slice.
    target
        Term to find. It is matched against the tokenizer's output, so
        case sensitivity follows the corpus's tokenizer.
    window
        Tokens of context on each side.
    n
        Keep only the first ``n`` lines; ``None`` keeps all matches.
    label
        Value written to the ``corpus`` column — handy when KWIC tables
        from two corpora are stitched together for comparison.
    """
    return _lines_to_concordance(
        _kwic_lines_from_corpus(corpus, target, label=label, window=window),
        target=target,
        window=window,
        n=n,
    )
124
+
125
+
126
def kwic_compare(
    a: Corpus | CorpusSlice,
    b: Corpus | CorpusSlice,
    target: str,
    window: int = 5,
    n_per_side: int = 5,
    collocate: str | None = None,
    label_a: str = "a",
    label_b: str = "b",
) -> ConcordanceResult:
    """Build one concordance holding KWIC lines for ``target`` from two corpora.

    The first ``n_per_side`` lines of ``a`` are followed by the first
    ``n_per_side`` lines of ``b``; the ``corpus`` column (``label_a`` /
    ``label_b``) tells them apart. When ``collocate`` is given, only
    contexts that also contain it are kept — this is the engine behind
    :meth:`CollocationShiftResult.explain`.
    """

    def first_lines(source: Corpus | CorpusSlice, tag: str) -> list[_KwicLine]:
        found = _kwic_lines_from_corpus(
            source, target, label=tag, window=window, collocate=collocate
        )
        return found[:n_per_side]

    combined = first_lines(a, label_a) + first_lines(b, label_b)
    return _lines_to_concordance(combined, target=target, window=window, n=None)
153
+
154
+
155
def representative_docs(
    corpus: Corpus | CorpusSlice,
    target: str,
    n: int = 5,
) -> pd.DataFrame:
    """Rank documents by how often they contain ``target`` and return the top ``n``.

    The result has columns ``doc_id`` (position of the document in the
    corpus), ``count`` and ``text``. Higher counts come first; equal counts
    keep corpus order. Documents in which ``target`` never occurs are left
    out, and a corpus with no matches yields an empty frame with the same
    three columns.
    """
    tokenize = corpus.tokenizer
    texts = corpus.docs[corpus.text_col].tolist()
    counted = (
        (index, tokenize(text).count(target), text)
        for index, text in enumerate(texts)
    )
    records: list[dict[str, object]] = [
        {"doc_id": index, "count": occurrences, "text": text}
        for index, occurrences, text in counted
        if occurrences > 0
    ]
    if not records:
        return pd.DataFrame(columns=["doc_id", "count", "text"])
    ranked = pd.DataFrame(records).sort_values(
        ["count", "doc_id"], ascending=[False, True], kind="stable"
    )
    return ranked.head(n).reset_index(drop=True)
@@ -0,0 +1,16 @@
1
+ """Corpus I/O — readers for txt, csv, parquet, DataFrame, DuckDB."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .duckdb import read_duckdb
6
+ from .huggingface import from_huggingface
7
+ from .readers import from_dataframe, read_csv, read_parquet, read_txt
8
+
9
# Public reader API: each name is re-exported from one of this package's
# submodules (``duckdb``, ``huggingface``, ``readers``).
__all__ = [
    "from_dataframe",
    "from_huggingface",
    "read_csv",
    "read_duckdb",
    "read_parquet",
    "read_txt",
]