pycorpdiff 0.1.0a6__tar.gz → 0.1.0a8__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/.gitignore +0 -3
- pycorpdiff-0.1.0a8/CHANGELOG.md +71 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/CITATION.cff +1 -1
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/PKG-INFO +42 -24
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/README.md +39 -21
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/pyproject.toml +13 -9
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/__init__.py +6 -5
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/compare.py +3 -1
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/corpus.py +9 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/duckdb.py +13 -1
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/bayes.py +10 -2
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/results.py +25 -8
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/shift.py +24 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_histwords.py +29 -15
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_quanteda.py +29 -23
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_sbert_slow.py +13 -2
- pycorpdiff-0.1.0a8/tests/unit/test_audit_a7_fixes.py +133 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_loglikelihood.py +46 -0
- pycorpdiff-0.1.0a6/CHANGELOG.md +0 -44
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/LICENSE +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a6 → pycorpdiff-0.1.0a8}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `pycorpdiff` are documented in this file. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
|
+
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.1.0a8] — first public release
|
|
8
|
+
|
|
9
|
+
The first public alpha of `pycorpdiff` — comparative corpus analysis
|
|
10
|
+
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
11
|
+
`compare.before_after`), nine `Result` dataclasses each implementing
|
|
12
|
+
the relevant subset of `.to_df / .plot / .explain / .summary /
|
|
13
|
+
.to_html / .to_json` (see `docs/design.md` for the per-Result method
|
|
14
|
+
matrix), two `typing.Protocol` extension points (`Tokenizer`,
|
|
15
|
+
`Embedder`), and opt-in extras for visualisation, semantic embedding,
|
|
16
|
+
temporal modelling, polars interop, DuckDB ingestion, 🤗 Datasets,
|
|
17
|
+
and notebook rendering.
|
|
18
|
+
|
|
19
|
+
### Analytical surface
|
|
20
|
+
|
|
21
|
+
- **Keyness**: signed log-likelihood G² with selectable formula
|
|
22
|
+
(`formula="rayson"` 2-cell shortcut, default; matches the UCREL
|
|
23
|
+
LL Wizard. `formula="dunning"` 4-cell G²; matches NLTK +
|
|
24
|
+
`quanteda::textstat_keyness(measure="lr")` byte-for-byte). Pearson
|
|
25
|
+
χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated Bayes
|
|
26
|
+
factor (also tracks the `formula=` choice), Juilland D / Gries DP
|
|
27
|
+
dispersion flagging, Benjamini–Hochberg correction, stop-word
|
|
28
|
+
filtering, empirical permutation *p*-values, N-way contingency G²
|
|
29
|
+
via `keyness_multi`.
|
|
30
|
+
- **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
|
|
31
|
+
cross-corpus `collocation_shift`; co-occurrence networks via
|
|
32
|
+
`cooccurrence_network`.
|
|
33
|
+
- **Semantic shift**: averaged contextual embeddings, Procrustes
|
|
34
|
+
alignment, multi-period `semantic_trajectory`, `neighborhood_drift`.
|
|
35
|
+
Embedder output shape is validated to catch silently-broken
|
|
36
|
+
embedders before they produce nonsense.
|
|
37
|
+
- **Temporal**: Wilson-CI trajectories, offline PELT changepoints,
|
|
38
|
+
online Bayesian changepoint detection, segmented-OLS interrupted
|
|
39
|
+
time series, Bayesian structural time-series causal impact,
|
|
40
|
+
state-space exponential-smoothing forecasting.
|
|
41
|
+
|
|
42
|
+
### Cross-validated
|
|
43
|
+
|
|
44
|
+
The package is checked against standard tools by automated test:
|
|
45
|
+
|
|
46
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
47
|
+
triples (fast tier; runs on every push).
|
|
48
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
49
|
+
on every adjacent bigram (slow tier).
|
|
50
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
51
|
+
US Conventions corpus (slow tier).
|
|
52
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
53
|
+
`formula="dunning"` (slow tier).
|
|
54
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
55
|
+
sanity check on Stanford SNAP COHA decade embeddings; skips
|
|
56
|
+
gracefully when the archive isn't reachable (slow tier).
|
|
57
|
+
|
|
58
|
+
### Extras
|
|
59
|
+
|
|
60
|
+
`[viz]`, `[semantic]`, `[temporal]`, `[polars]`, `[duckdb]`, `[nlp]`,
|
|
61
|
+
`[huggingface]`, `[notebooks]`, `[all]` are MIT-compatible. A separate
|
|
62
|
+
`[showcase]` extra pulls in `pysofra` (GPL-3.0-or-later) for
|
|
63
|
+
JAMA-style table polish in the showcase notebook — opt in explicitly
|
|
64
|
+
if you accept that licence.
|
|
65
|
+
|
|
66
|
+
### Infrastructure
|
|
67
|
+
|
|
68
|
+
Hundreds of tests, `ruff` + `mypy --strict` clean across the source
|
|
69
|
+
tree, matrix CI on three Python versions × two operating systems,
|
|
70
|
+
plus a slow-tier CI job exercising the cross-validation receipts
|
|
71
|
+
against NLTK + quanteda on main pushes.
|
|
@@ -4,7 +4,7 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.
|
|
7
|
+
version: 0.1.0a8
|
|
8
8
|
date-released: 2026-05-25
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a6
|
|
3
|
+
Version: 0.1.0a8
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -54,7 +54,6 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
|
54
54
|
Requires-Dist: networkx>=3.1; extra == 'all'
|
|
55
55
|
Requires-Dist: polars>=1.0; extra == 'all'
|
|
56
56
|
Requires-Dist: pyarrow>=15; extra == 'all'
|
|
57
|
-
Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
|
|
58
57
|
Requires-Dist: ruptures>=1.1; extra == 'all'
|
|
59
58
|
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
60
59
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
@@ -77,7 +76,6 @@ Provides-Extra: nlp
|
|
|
77
76
|
Requires-Dist: spacy>=3.7; extra == 'nlp'
|
|
78
77
|
Provides-Extra: notebooks
|
|
79
78
|
Requires-Dist: jupyter>=1.0; extra == 'notebooks'
|
|
80
|
-
Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
|
|
81
79
|
Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
|
|
82
80
|
Provides-Extra: polars
|
|
83
81
|
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
@@ -85,6 +83,8 @@ Requires-Dist: pyarrow>=15; extra == 'polars'
|
|
|
85
83
|
Provides-Extra: semantic
|
|
86
84
|
Requires-Dist: scikit-learn>=1.3; extra == 'semantic'
|
|
87
85
|
Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
|
|
86
|
+
Provides-Extra: showcase
|
|
87
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'showcase'
|
|
88
88
|
Provides-Extra: temporal
|
|
89
89
|
Requires-Dist: ruptures>=1.1; extra == 'temporal'
|
|
90
90
|
Requires-Dist: statsmodels>=0.14; extra == 'temporal'
|
|
@@ -127,11 +127,11 @@ and computational social science routinely have:
|
|
|
127
127
|
`pycorpdiff` is positioned as **orchestration**, not reinvention.
|
|
128
128
|
Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
|
|
129
129
|
`SBERT`-compatible model) plug in via two `typing.Protocol` extension
|
|
130
|
-
points — one-line adapters, no plugin registry. The base install
|
|
131
|
-
|
|
132
|
-
via extras.
|
|
130
|
+
points — one-line adapters, no plugin registry. The base install's
|
|
131
|
+
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
132
|
+
`pyarrow`; everything else is opt-in via extras.
|
|
133
133
|
|
|
134
|
-
> **Status: alpha (0.1.
|
|
134
|
+
> **Status: alpha (0.1.0a8).** Public API is stable for the features
|
|
135
135
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
136
136
|
|
|
137
137
|
## The three-layer architecture
|
|
@@ -178,7 +178,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
|
178
178
|
|
|
179
179
|
```python
|
|
180
180
|
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
181
|
-
pcd.compare(a, b).keyness()
|
|
181
|
+
pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
|
|
182
|
+
pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
|
|
182
183
|
pcd.compare(a, b).collocation_shift("immigrant")
|
|
183
184
|
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
184
185
|
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
@@ -190,7 +191,7 @@ tr.changepoints() # offline PELT
|
|
|
190
191
|
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
191
192
|
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
192
193
|
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
193
|
-
tr.forecast(horizon=4) # state-space ETS
|
|
194
|
+
tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
|
|
194
195
|
|
|
195
196
|
# Before / after a known event
|
|
196
197
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
@@ -209,17 +210,20 @@ every analytical surface.
|
|
|
209
210
|
## Installation
|
|
210
211
|
|
|
211
212
|
```bash
|
|
212
|
-
pip install pycorpdiff
|
|
213
|
-
pip install "pycorpdiff[viz]"
|
|
214
|
-
pip install "pycorpdiff[semantic]"
|
|
215
|
-
pip install "pycorpdiff[temporal]"
|
|
216
|
-
pip install "pycorpdiff[notebooks]"
|
|
217
|
-
pip install "pycorpdiff[all]"
|
|
213
|
+
pip install pycorpdiff # lexical-comparative core (MIT)
|
|
214
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
215
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
216
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
217
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
|
|
218
|
+
pip install "pycorpdiff[all]" # everything MIT-compatible
|
|
219
|
+
pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
|
|
218
220
|
```
|
|
219
221
|
|
|
220
|
-
The base install
|
|
221
|
-
`scipy`, `pyarrow
|
|
222
|
-
only pay for what you use.
|
|
222
|
+
The base install's direct runtime dependencies are `numpy`, `pandas`,
|
|
223
|
+
`scipy`, and `pyarrow`; optional extras land per analytical layer so
|
|
224
|
+
you only pay for what you use. `[showcase]` is broken out separately
|
|
225
|
+
because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
|
|
226
|
+
that extra remains MIT-only.
|
|
223
227
|
|
|
224
228
|
To work from source:
|
|
225
229
|
|
|
@@ -232,13 +236,27 @@ pytest -q
|
|
|
232
236
|
|
|
233
237
|
## Cross-validation receipts
|
|
234
238
|
|
|
235
|
-
The math
|
|
239
|
+
The math is checked against standard tools by automated test. The
|
|
240
|
+
fast tier runs on every push (matrix CI); the slow tier needs heavy
|
|
241
|
+
optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
|
|
242
|
+
downloads) and runs on main pushes only.
|
|
236
243
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
- **
|
|
240
|
-
|
|
241
|
-
|
|
244
|
+
Fast tier:
|
|
245
|
+
|
|
246
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
247
|
+
triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
|
|
248
|
+
|
|
249
|
+
Slow tier:
|
|
250
|
+
|
|
251
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
252
|
+
on every adjacent bigram
|
|
253
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
254
|
+
US Conventions corpus
|
|
255
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
256
|
+
`formula="dunning"`
|
|
257
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
258
|
+
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
259
|
+
gracefully if the archive isn't reachable)
|
|
242
260
|
|
|
243
261
|
## Citation
|
|
244
262
|
|
|
@@ -31,11 +31,11 @@ and computational social science routinely have:
|
|
|
31
31
|
`pycorpdiff` is positioned as **orchestration**, not reinvention.
|
|
32
32
|
Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
|
|
33
33
|
`SBERT`-compatible model) plug in via two `typing.Protocol` extension
|
|
34
|
-
points — one-line adapters, no plugin registry. The base install
|
|
35
|
-
|
|
36
|
-
via extras.
|
|
34
|
+
points — one-line adapters, no plugin registry. The base install's
|
|
35
|
+
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
36
|
+
`pyarrow`; everything else is opt-in via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.
|
|
38
|
+
> **Status: alpha (0.1.0a8).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -82,7 +82,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
|
82
82
|
|
|
83
83
|
```python
|
|
84
84
|
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
85
|
-
pcd.compare(a, b).keyness()
|
|
85
|
+
pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
|
|
86
|
+
pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
|
|
86
87
|
pcd.compare(a, b).collocation_shift("immigrant")
|
|
87
88
|
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
88
89
|
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
@@ -94,7 +95,7 @@ tr.changepoints() # offline PELT
|
|
|
94
95
|
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
95
96
|
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
96
97
|
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
97
|
-
tr.forecast(horizon=4) # state-space ETS
|
|
98
|
+
tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
|
|
98
99
|
|
|
99
100
|
# Before / after a known event
|
|
100
101
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
@@ -113,17 +114,20 @@ every analytical surface.
|
|
|
113
114
|
## Installation
|
|
114
115
|
|
|
115
116
|
```bash
|
|
116
|
-
pip install pycorpdiff
|
|
117
|
-
pip install "pycorpdiff[viz]"
|
|
118
|
-
pip install "pycorpdiff[semantic]"
|
|
119
|
-
pip install "pycorpdiff[temporal]"
|
|
120
|
-
pip install "pycorpdiff[notebooks]"
|
|
121
|
-
pip install "pycorpdiff[all]"
|
|
117
|
+
pip install pycorpdiff # lexical-comparative core (MIT)
|
|
118
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
119
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
120
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
121
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
|
|
122
|
+
pip install "pycorpdiff[all]" # everything MIT-compatible
|
|
123
|
+
pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
|
|
122
124
|
```
|
|
123
125
|
|
|
124
|
-
The base install
|
|
125
|
-
`scipy`, `pyarrow
|
|
126
|
-
only pay for what you use.
|
|
126
|
+
The base install's direct runtime dependencies are `numpy`, `pandas`,
|
|
127
|
+
`scipy`, and `pyarrow`; optional extras land per analytical layer so
|
|
128
|
+
you only pay for what you use. `[showcase]` is broken out separately
|
|
129
|
+
because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
|
|
130
|
+
that extra remains MIT-only.
|
|
127
131
|
|
|
128
132
|
To work from source:
|
|
129
133
|
|
|
@@ -136,13 +140,27 @@ pytest -q
|
|
|
136
140
|
|
|
137
141
|
## Cross-validation receipts
|
|
138
142
|
|
|
139
|
-
The math
|
|
143
|
+
The math is checked against standard tools by automated test. The
|
|
144
|
+
fast tier runs on every push (matrix CI); the slow tier needs heavy
|
|
145
|
+
optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
|
|
146
|
+
downloads) and runs on main pushes only.
|
|
140
147
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
- **
|
|
144
|
-
|
|
145
|
-
|
|
148
|
+
Fast tier:
|
|
149
|
+
|
|
150
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
151
|
+
triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
|
|
152
|
+
|
|
153
|
+
Slow tier:
|
|
154
|
+
|
|
155
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
156
|
+
on every adjacent bigram
|
|
157
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
158
|
+
US Conventions corpus
|
|
159
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
160
|
+
`formula="dunning"`
|
|
161
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
162
|
+
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
163
|
+
gracefully if the archive isn't reachable)
|
|
146
164
|
|
|
147
165
|
## Citation
|
|
148
166
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a6"
|
|
7
|
+
version = "0.1.0a8"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -62,13 +62,18 @@ nlp = ["spacy>=3.7"]
|
|
|
62
62
|
# Public-text-corpus hub. Heavy (pulls pyarrow, fsspec, requests, aiohttp),
|
|
63
63
|
# so opt-in only — base install stays small.
|
|
64
64
|
huggingface = ["datasets>=2.14"]
|
|
65
|
-
# Needed if you want to execute the
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
#
|
|
71
|
-
#
|
|
65
|
+
# Needed if you want to execute the example notebooks. `jupyter` runs
|
|
66
|
+
# the notebook; `vl-convert` does static SVG/PNG export of altair charts.
|
|
67
|
+
# Kept MIT-clean — see `showcase` below for the JAMA-style table polish.
|
|
68
|
+
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5"]
|
|
69
|
+
# Adds `pysofra` for the showcase notebook's JAMA-style typography.
|
|
70
|
+
# IMPORTANT: pysofra is GPL-3.0-or-later. Opting in to `[showcase]` (or
|
|
71
|
+
# installing pysofra directly) brings GPL into your environment; pure
|
|
72
|
+
# pycorpdiff use without this extra remains MIT-only.
|
|
73
|
+
showcase = ["pysofra>=0.1.0a3"]
|
|
74
|
+
# Meta-extra: every MIT-compatible optional code path. Does NOT include
|
|
75
|
+
# `[showcase]` because pysofra is GPL-3.0-or-later; install
|
|
76
|
+
# `pycorpdiff[all,showcase]` explicitly if you accept that licence.
|
|
72
77
|
all = [
|
|
73
78
|
"altair>=5",
|
|
74
79
|
"matplotlib>=3.8",
|
|
@@ -84,7 +89,6 @@ all = [
|
|
|
84
89
|
"spacy>=3.7",
|
|
85
90
|
"jupyter>=1.0",
|
|
86
91
|
"vl-convert-python>=1.5",
|
|
87
|
-
"pysofra>=0.1.0a3",
|
|
88
92
|
]
|
|
89
93
|
dev = [
|
|
90
94
|
"pytest>=8",
|
|
@@ -6,20 +6,21 @@ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
|
|
|
6
6
|
:class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
|
|
7
7
|
:class:`NetworkResult`, :class:`ForecastResult`,
|
|
8
8
|
:class:`CausalImpactResult`, :class:`BocpdResult`,
|
|
9
|
-
:class:`ConcordanceResult`), each implementing the
|
|
10
|
-
``.to_df / .plot / .explain / .summary / .to_html / .to_json``
|
|
9
|
+
:class:`ConcordanceResult`), each implementing the relevant subset of
|
|
10
|
+
the ``.to_df / .plot / .explain / .summary / .to_html / .to_json``
|
|
11
|
+
contract. See ``docs/design.md`` for the per-Result method matrix.
|
|
11
12
|
|
|
12
13
|
Example
|
|
13
14
|
-------
|
|
14
15
|
|
|
15
16
|
>>> import pycorpdiff as pcd
|
|
16
|
-
>>> pcd.__version__
|
|
17
|
-
|
|
17
|
+
>>> isinstance(pcd.__version__, str)
|
|
18
|
+
True
|
|
18
19
|
"""
|
|
19
20
|
|
|
20
21
|
from __future__ import annotations
|
|
21
22
|
|
|
22
|
-
__version__ = "0.1.
|
|
23
|
+
__version__ = "0.1.0a8"
|
|
23
24
|
|
|
24
25
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
25
26
|
from .compare import Comparison, compare
|
|
@@ -149,7 +149,9 @@ class Comparison:
|
|
|
149
149
|
if effect_size:
|
|
150
150
|
table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
|
|
151
151
|
table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
|
|
152
|
-
table["bayes_factor"] = _bayes_factor(
|
|
152
|
+
table["bayes_factor"] = _bayes_factor(
|
|
153
|
+
a_kept, b_kept, n_a, n_b, formula=formula
|
|
154
|
+
)
|
|
153
155
|
|
|
154
156
|
if dispersion:
|
|
155
157
|
kept_terms = table.index
|
|
@@ -242,6 +242,15 @@ class Corpus:
|
|
|
242
242
|
"""
|
|
243
243
|
from .temporal.slicing import TemporalCorpus # local import to break cycle
|
|
244
244
|
|
|
245
|
+
if len(self.docs) == 0:
|
|
246
|
+
raise ValueError(
|
|
247
|
+
"by_time() requires a non-empty corpus; got 0 documents."
|
|
248
|
+
)
|
|
249
|
+
if col not in self.docs.columns:
|
|
250
|
+
raise ValueError(
|
|
251
|
+
f"by_time(col={col!r}, ...): column not found in corpus. "
|
|
252
|
+
f"Available columns: {list(self.docs.columns)!r}."
|
|
253
|
+
)
|
|
245
254
|
return TemporalCorpus(parent=self, time_col=col, freq=freq)
|
|
246
255
|
|
|
247
256
|
def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
|
|
@@ -71,12 +71,24 @@ def read_duckdb(
|
|
|
71
71
|
... )
|
|
72
72
|
"""
|
|
73
73
|
try:
|
|
74
|
-
import duckdb
|
|
74
|
+
import duckdb
|
|
75
75
|
except ImportError as exc: # pragma: no cover
|
|
76
76
|
raise ImportError(
|
|
77
77
|
"read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
|
|
78
78
|
) from exc
|
|
79
79
|
|
|
80
|
+
if isinstance(connection, str):
|
|
81
|
+
raise TypeError(
|
|
82
|
+
"read_duckdb expects a DuckDB connection, not a file path. "
|
|
83
|
+
f"Got connection={connection!r}. Open one first: "
|
|
84
|
+
f'duckdb.connect({connection!r}), or pcd.read_duckdb(duckdb.connect(), "...")'
|
|
85
|
+
)
|
|
86
|
+
if not isinstance(connection, duckdb.DuckDBPyConnection):
|
|
87
|
+
raise TypeError(
|
|
88
|
+
"read_duckdb expects a duckdb.DuckDBPyConnection; got "
|
|
89
|
+
f"{type(connection).__name__}. Open one via duckdb.connect(...)."
|
|
90
|
+
)
|
|
91
|
+
|
|
80
92
|
cursor = connection.execute(query, params) if params is not None else connection.execute(query)
|
|
81
93
|
df = cursor.df()
|
|
82
94
|
if text_col not in df.columns:
|
|
@@ -15,7 +15,7 @@ from __future__ import annotations
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
17
|
|
|
18
|
-
from .loglikelihood import log_likelihood
|
|
18
|
+
from .loglikelihood import LLFormula, log_likelihood
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def bayes_factor(
|
|
@@ -23,6 +23,8 @@ def bayes_factor(
|
|
|
23
23
|
counts_b: pd.Series,
|
|
24
24
|
total_a: int,
|
|
25
25
|
total_b: int,
|
|
26
|
+
*,
|
|
27
|
+
formula: LLFormula = "rayson",
|
|
26
28
|
) -> pd.Series:
|
|
27
29
|
"""BIC-approximated Bayes factor for each term's frequency difference.
|
|
28
30
|
|
|
@@ -31,6 +33,12 @@ def bayes_factor(
|
|
|
31
33
|
the unsigned log-likelihood. The Bayes factor is then
|
|
32
34
|
``exp(BIC / 2)``. Wilson (2013) is the keyness application.
|
|
33
35
|
|
|
36
|
+
``formula`` selects which G² flavour feeds the BF: ``"rayson"`` (the
|
|
37
|
+
2-cell shortcut, default; matches the LL Wizard) or ``"dunning"``
|
|
38
|
+
(the full 4-cell G²; matches quanteda/NLTK). Use the same
|
|
39
|
+
``formula=`` as the ``keyness()`` call that produced the row so the
|
|
40
|
+
G² and the Bayes factor in a single row describe the same statistic.
|
|
41
|
+
|
|
34
42
|
Interpret with Kass & Raftery (1995):
|
|
35
43
|
|
|
36
44
|
- ``BF > 2`` : positive evidence
|
|
@@ -43,7 +51,7 @@ def bayes_factor(
|
|
|
43
51
|
plots / sorts handle it.
|
|
44
52
|
"""
|
|
45
53
|
terms = counts_a.index.union(counts_b.index)
|
|
46
|
-
ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
|
|
54
|
+
ll_table = log_likelihood(counts_a, counts_b, total_a, total_b, formula=formula)
|
|
47
55
|
g2_abs = ll_table["g2"].abs()
|
|
48
56
|
bic = g2_abs - np.log(total_a + total_b)
|
|
49
57
|
with np.errstate(over="ignore"):
|
|
@@ -10,7 +10,7 @@ contract:
|
|
|
10
10
|
- ``.summary()`` returns a short human-readable string.
|
|
11
11
|
- ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
|
|
12
12
|
KWIC evidence for one row of the result. Defined only on
|
|
13
|
-
|
|
13
|
+
term-ranked Results (``KeynessResult``, ``CollocationShiftResult``)
|
|
14
14
|
where "one row of the result" maps to a target term.
|
|
15
15
|
|
|
16
16
|
See ``docs/design.md`` for the per-Result method matrix. This contract
|
|
@@ -257,15 +257,32 @@ class SemanticShiftResult:
|
|
|
257
257
|
return _table_to_json(self.table, path, **kw)
|
|
258
258
|
|
|
259
259
|
def plot(self, **kw: Any) -> alt.Chart:
|
|
260
|
-
"""
|
|
260
|
+
"""Horizontal bar chart of cosine distance per target term.
|
|
261
261
|
|
|
262
|
-
For a
|
|
263
|
-
|
|
264
|
-
|
|
262
|
+
For a multi-period trajectory of cosine distances (an across-
|
|
263
|
+
time view rather than a single A-vs-B snapshot), use
|
|
264
|
+
:func:`pycorpdiff.semantic_trajectory` paired with
|
|
265
|
+
:func:`pycorpdiff.viz.semantic_forecast_plot`.
|
|
266
|
+
|
|
267
|
+
Extra keyword arguments forward to :meth:`altair.Chart.properties`.
|
|
265
268
|
"""
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
+
import altair as alt
|
|
270
|
+
|
|
271
|
+
return ( # type: ignore[no-any-return]
|
|
272
|
+
alt.Chart(self.table)
|
|
273
|
+
.mark_bar(color="#0b6e7c")
|
|
274
|
+
.encode(
|
|
275
|
+
x=alt.X("cosine_distance:Q", title="Cosine distance (A → B)"),
|
|
276
|
+
y=alt.Y("target:N", sort="-x", title=None),
|
|
277
|
+
tooltip=[
|
|
278
|
+
"target",
|
|
279
|
+
alt.Tooltip("cosine_similarity:Q", format=".4f"),
|
|
280
|
+
alt.Tooltip("cosine_distance:Q", format=".4f"),
|
|
281
|
+
"n_contexts_a",
|
|
282
|
+
"n_contexts_b",
|
|
283
|
+
],
|
|
284
|
+
)
|
|
285
|
+
.properties(width=400, **kw)
|
|
269
286
|
)
|
|
270
287
|
|
|
271
288
|
def neighbors_before(
|
|
@@ -46,6 +46,28 @@ def _centroid(vectors: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
|
|
|
46
46
|
return out
|
|
47
47
|
|
|
48
48
|
|
|
49
|
+
def _validate_embeddings(
|
|
50
|
+
vecs: np.ndarray[Any, Any], expected_rows: int, side: str
|
|
51
|
+
) -> None:
|
|
52
|
+
"""Catch mis-shaped embedder output before it produces silent nonsense.
|
|
53
|
+
|
|
54
|
+
A 1-D return from ``embedder.encode`` would otherwise be averaged into
|
|
55
|
+
a scalar centroid and yield ``cosine_similarity == 1.0`` for any
|
|
56
|
+
comparison — a silently wrong result.
|
|
57
|
+
"""
|
|
58
|
+
if vecs.ndim != 2:
|
|
59
|
+
raise ValueError(
|
|
60
|
+
f"embedder.encode() for corpus {side!r} returned an array of "
|
|
61
|
+
f"rank {vecs.ndim}; expected 2 (shape (n_windows, d)). "
|
|
62
|
+
f"Got shape {vecs.shape}."
|
|
63
|
+
)
|
|
64
|
+
if vecs.shape[0] != expected_rows:
|
|
65
|
+
raise ValueError(
|
|
66
|
+
f"embedder.encode() for corpus {side!r} returned "
|
|
67
|
+
f"{vecs.shape[0]} rows; expected {expected_rows} (one per window)."
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
|
|
49
71
|
def semantic_shift(
|
|
50
72
|
a: Corpus | CorpusSlice,
|
|
51
73
|
b: Corpus | CorpusSlice,
|
|
@@ -103,6 +125,8 @@ def semantic_shift(
|
|
|
103
125
|
|
|
104
126
|
vecs_a = np.asarray(embedder.encode(wins_a), dtype=np.float64)
|
|
105
127
|
vecs_b = np.asarray(embedder.encode(wins_b), dtype=np.float64)
|
|
128
|
+
_validate_embeddings(vecs_a, expected_rows=len(wins_a), side="a")
|
|
129
|
+
_validate_embeddings(vecs_b, expected_rows=len(wins_b), side="b")
|
|
106
130
|
|
|
107
131
|
if align == "procrustes":
|
|
108
132
|
# Procrustes wants two matrices of the same shape. Pad / truncate
|
|
@@ -71,9 +71,12 @@ def test_fetch_coha_1990_returns_real_vocab(histwords_cache_dir: Path) -> None:
|
|
|
71
71
|
everyday words. Doesn't check vector values — that's the next test."""
|
|
72
72
|
if not _has_internet():
|
|
73
73
|
pytest.skip("offline")
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
74
|
+
try:
|
|
75
|
+
vecs = pcd.fetch_histwords_decade(
|
|
76
|
+
1990, source="coha", cache_dir=histwords_cache_dir
|
|
77
|
+
)
|
|
78
|
+
except FileNotFoundError as exc:
|
|
79
|
+
pytest.skip(f"COHA 1990s not available: {exc}")
|
|
77
80
|
# COHA 1990s vocab is large (~50k+ words). Expect basic English words.
|
|
78
81
|
for word in ("the", "and", "of", "is", "people"):
|
|
79
82
|
assert word in vecs, f"expected {word!r} in 1990s COHA vocab"
|
|
@@ -98,6 +101,8 @@ def test_known_shifters_show_high_cosine_distance(
|
|
|
98
101
|
)
|
|
99
102
|
except KeyError:
|
|
100
103
|
pytest.skip(f"{word!r} missing from COHA 1900s or 1990s vocab")
|
|
104
|
+
except FileNotFoundError as exc:
|
|
105
|
+
pytest.skip(f"COHA decade data not available: {exc}")
|
|
101
106
|
assert d > 0.3, (
|
|
102
107
|
f"expected {word!r} to show cosine distance > 0.3 "
|
|
103
108
|
f"between 1900s and 1990s COHA; got {d:.3f}"
|
|
@@ -115,9 +120,12 @@ def test_stable_function_words_show_low_cosine_distance(
|
|
|
115
120
|
pytest.skip("offline")
|
|
116
121
|
stable = ["the", "and", "of"]
|
|
117
122
|
for word in stable:
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
123
|
+
try:
|
|
124
|
+
d = pcd.histwords_cosine_shift(
|
|
125
|
+
1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
|
|
126
|
+
)
|
|
127
|
+
except FileNotFoundError as exc:
|
|
128
|
+
pytest.skip(f"COHA decade data not available: {exc}")
|
|
121
129
|
assert d < 0.30, (
|
|
122
130
|
f"expected {word!r} to be stable across decades "
|
|
123
131
|
f"(cosine distance < 0.30); got {d:.3f}"
|
|
@@ -137,19 +145,25 @@ def test_shifter_distance_exceeds_stable_distance_by_meaningful_margin(
|
|
|
137
145
|
shifter_distances = []
|
|
138
146
|
for word in ("gay", "broadcast", "awful"):
|
|
139
147
|
with contextlib.suppress(KeyError):
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
148
|
+
try:
|
|
149
|
+
shifter_distances.append(
|
|
150
|
+
pcd.histwords_cosine_shift(
|
|
151
|
+
1900, 1990, word, source="coha",
|
|
152
|
+
cache_dir=histwords_cache_dir,
|
|
153
|
+
)
|
|
144
154
|
)
|
|
145
|
-
|
|
155
|
+
except FileNotFoundError as exc:
|
|
156
|
+
pytest.skip(f"COHA decade data not available: {exc}")
|
|
146
157
|
stable_distances = []
|
|
147
158
|
for word in ("the", "and", "of"):
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
159
|
+
try:
|
|
160
|
+
stable_distances.append(
|
|
161
|
+
pcd.histwords_cosine_shift(
|
|
162
|
+
1900, 1990, word, source="coha", cache_dir=histwords_cache_dir
|
|
163
|
+
)
|
|
151
164
|
)
|
|
152
|
-
|
|
165
|
+
except FileNotFoundError as exc:
|
|
166
|
+
pytest.skip(f"COHA decade data not available: {exc}")
|
|
153
167
|
if not shifter_distances:
|
|
154
168
|
pytest.skip("no shifters available in COHA vocab")
|
|
155
169
|
avg_shift = sum(shifter_distances) / len(shifter_distances)
|