pycorpdiff 0.1.0a5__tar.gz → 0.1.0a7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/.gitignore +0 -3
- pycorpdiff-0.1.0a7/CHANGELOG.md +71 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/CITATION.cff +4 -2
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/PKG-INFO +42 -24
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/README.md +39 -21
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/pyproject.toml +15 -9
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/__init__.py +6 -5
- pycorpdiff-0.1.0a7/src/pycorpdiff/_backends/pandas.py +9 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/compare.py +15 -2
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/corpus.py +9 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/duckdb.py +13 -1
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/huggingface.py +1 -1
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/bayes.py +10 -2
- pycorpdiff-0.1.0a7/src/pycorpdiff/keyness/loglikelihood.py +149 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/results.py +37 -14
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/shift.py +24 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_histwords.py +29 -15
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_quanteda.py +40 -27
- pycorpdiff-0.1.0a7/tests/unit/test_audit_a7_fixes.py +133 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_loglikelihood.py +46 -0
- pycorpdiff-0.1.0a5/CHANGELOG.md +0 -44
- pycorpdiff-0.1.0a5/src/pycorpdiff/_backends/pandas.py +0 -3
- pycorpdiff-0.1.0a5/src/pycorpdiff/keyness/loglikelihood.py +0 -92
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/LICENSE +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to `pycorpdiff` are documented in this file. The format
|
|
4
|
+
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
|
+
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
|
+
|
|
7
|
+
## [0.1.0a7] — first public release
|
|
8
|
+
|
|
9
|
+
The first public alpha of `pycorpdiff` — comparative corpus analysis
|
|
10
|
+
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
11
|
+
`compare.before_after`), nine `Result` dataclasses each implementing
|
|
12
|
+
the relevant subset of `.to_df / .plot / .explain / .summary /
|
|
13
|
+
.to_html / .to_json` (see `docs/design.md` for the per-Result method
|
|
14
|
+
matrix), two `typing.Protocol` extension points (`Tokenizer`,
|
|
15
|
+
`Embedder`), and opt-in extras for visualisation, semantic embedding,
|
|
16
|
+
temporal modelling, polars interop, DuckDB ingestion, 🤗 Datasets,
|
|
17
|
+
and notebook rendering.
|
|
18
|
+
|
|
19
|
+
### Analytical surface
|
|
20
|
+
|
|
21
|
+
- **Keyness**: signed log-likelihood G² with selectable formula
|
|
22
|
+
(`formula="rayson"` 2-cell shortcut, default; matches the UCREL
|
|
23
|
+
LL Wizard. `formula="dunning"` 4-cell G²; matches NLTK +
|
|
24
|
+
`quanteda::textstat_keyness(measure="lr")` byte-for-byte). Pearson
|
|
25
|
+
χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated Bayes
|
|
26
|
+
factor (also tracks the `formula=` choice), Juilland D / Gries DP
|
|
27
|
+
dispersion flagging, Benjamini–Hochberg correction, stop-word
|
|
28
|
+
filtering, empirical permutation *p*-values, N-way contingency G²
|
|
29
|
+
via `keyness_multi`.
|
|
30
|
+
- **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
|
|
31
|
+
cross-corpus `collocation_shift`; co-occurrence networks via
|
|
32
|
+
`cooccurrence_network`.
|
|
33
|
+
- **Semantic shift**: averaged contextual embeddings, Procrustes
|
|
34
|
+
alignment, multi-period `semantic_trajectory`, `neighborhood_drift`.
|
|
35
|
+
Embedder output shape is validated to catch silently-broken
|
|
36
|
+
embedders before they produce nonsense.
|
|
37
|
+
- **Temporal**: Wilson-CI trajectories, offline PELT changepoints,
|
|
38
|
+
online Bayesian changepoint detection, segmented-OLS interrupted
|
|
39
|
+
time series, Bayesian structural time-series causal impact,
|
|
40
|
+
state-space exponential-smoothing forecasting.
|
|
41
|
+
|
|
42
|
+
### Cross-validated
|
|
43
|
+
|
|
44
|
+
The package is checked against standard tools by automated tests:
|
|
45
|
+
|
|
46
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
47
|
+
triples (fast tier; runs on every push).
|
|
48
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
49
|
+
on every adjacent bigram (slow tier).
|
|
50
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
51
|
+
US Conventions corpus (slow tier).
|
|
52
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
53
|
+
`formula="dunning"` (slow tier).
|
|
54
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
55
|
+
sanity check on Stanford SNAP COHA decade embeddings; skips
|
|
56
|
+
gracefully when the archive isn't reachable (slow tier).
|
|
57
|
+
|
|
58
|
+
### Extras
|
|
59
|
+
|
|
60
|
+
`[viz]`, `[semantic]`, `[temporal]`, `[polars]`, `[duckdb]`, `[nlp]`,
|
|
61
|
+
`[huggingface]`, `[notebooks]`, `[all]` are MIT-compatible. A separate
|
|
62
|
+
`[showcase]` extra pulls in `pysofra` (GPL-3.0-or-later) for
|
|
63
|
+
JAMA-style table polish in the showcase notebook — opt in explicitly
|
|
64
|
+
if you accept that licence.
|
|
65
|
+
|
|
66
|
+
### Infrastructure
|
|
67
|
+
|
|
68
|
+
Hundreds of tests, `ruff` + `mypy --strict` clean across the source
|
|
69
|
+
tree, matrix CI on three Python versions × two operating systems,
|
|
70
|
+
plus a slow-tier CI job exercising the cross-validation receipts
|
|
71
|
+
against NLTK + quanteda on main pushes.
|
|
@@ -4,7 +4,7 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.
|
|
7
|
+
version: 0.1.0a7
|
|
8
8
|
date-released: 2026-05-25
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
@@ -32,7 +32,9 @@ abstract: >
|
|
|
32
32
|
API. The package targets corpus linguistics, digital humanities,
|
|
33
33
|
computational social science, and discourse analysis research,
|
|
34
34
|
emphasising interpretability, explainability, statistical rigour,
|
|
35
|
-
and reproducibility.
|
|
35
|
+
and reproducibility. A bundled synthetic UK-Hansard-style sample
|
|
36
|
+
ships for offline demonstration; real-data interfaces include
|
|
37
|
+
fetch_hansard and from_huggingface.
|
|
36
38
|
identifiers:
|
|
37
39
|
- type: url
|
|
38
40
|
value: "https://github.com/jturner-uofl/pycorpdiff"
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a5
|
|
3
|
+
Version: 0.1.0a7
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -54,7 +54,6 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
|
54
54
|
Requires-Dist: networkx>=3.1; extra == 'all'
|
|
55
55
|
Requires-Dist: polars>=1.0; extra == 'all'
|
|
56
56
|
Requires-Dist: pyarrow>=15; extra == 'all'
|
|
57
|
-
Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
|
|
58
57
|
Requires-Dist: ruptures>=1.1; extra == 'all'
|
|
59
58
|
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
60
59
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
@@ -77,7 +76,6 @@ Provides-Extra: nlp
|
|
|
77
76
|
Requires-Dist: spacy>=3.7; extra == 'nlp'
|
|
78
77
|
Provides-Extra: notebooks
|
|
79
78
|
Requires-Dist: jupyter>=1.0; extra == 'notebooks'
|
|
80
|
-
Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
|
|
81
79
|
Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
|
|
82
80
|
Provides-Extra: polars
|
|
83
81
|
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
@@ -85,6 +83,8 @@ Requires-Dist: pyarrow>=15; extra == 'polars'
|
|
|
85
83
|
Provides-Extra: semantic
|
|
86
84
|
Requires-Dist: scikit-learn>=1.3; extra == 'semantic'
|
|
87
85
|
Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
|
|
86
|
+
Provides-Extra: showcase
|
|
87
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'showcase'
|
|
88
88
|
Provides-Extra: temporal
|
|
89
89
|
Requires-Dist: ruptures>=1.1; extra == 'temporal'
|
|
90
90
|
Requires-Dist: statsmodels>=0.14; extra == 'temporal'
|
|
@@ -127,11 +127,11 @@ and computational social science routinely have:
|
|
|
127
127
|
`pycorpdiff` is positioned as **orchestration**, not reinvention.
|
|
128
128
|
Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
|
|
129
129
|
`SBERT`-compatible model) plug in via two `typing.Protocol` extension
|
|
130
|
-
points — one-line adapters, no plugin registry. The base install
|
|
131
|
-
|
|
132
|
-
via extras.
|
|
130
|
+
points — one-line adapters, no plugin registry. The base install's
|
|
131
|
+
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
132
|
+
`pyarrow`; everything else is opt-in via extras.
|
|
133
133
|
|
|
134
|
-
> **Status: alpha (0.1.
|
|
134
|
+
> **Status: alpha (0.1.0a7).** Public API is stable for the features
|
|
135
135
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
136
136
|
|
|
137
137
|
## The three-layer architecture
|
|
@@ -178,7 +178,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
|
178
178
|
|
|
179
179
|
```python
|
|
180
180
|
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
181
|
-
pcd.compare(a, b).keyness()
|
|
181
|
+
pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
|
|
182
|
+
pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
|
|
182
183
|
pcd.compare(a, b).collocation_shift("immigrant")
|
|
183
184
|
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
184
185
|
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
@@ -190,7 +191,7 @@ tr.changepoints() # offline PELT
|
|
|
190
191
|
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
191
192
|
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
192
193
|
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
193
|
-
tr.forecast(horizon=4) # state-space ETS
|
|
194
|
+
tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
|
|
194
195
|
|
|
195
196
|
# Before / after a known event
|
|
196
197
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
@@ -209,17 +210,20 @@ every analytical surface.
|
|
|
209
210
|
## Installation
|
|
210
211
|
|
|
211
212
|
```bash
|
|
212
|
-
pip install pycorpdiff
|
|
213
|
-
pip install "pycorpdiff[viz]"
|
|
214
|
-
pip install "pycorpdiff[semantic]"
|
|
215
|
-
pip install "pycorpdiff[temporal]"
|
|
216
|
-
pip install "pycorpdiff[notebooks]"
|
|
217
|
-
pip install "pycorpdiff[all]"
|
|
213
|
+
pip install pycorpdiff # lexical-comparative core (MIT)
|
|
214
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
215
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
216
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
217
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
|
|
218
|
+
pip install "pycorpdiff[all]" # everything MIT-compatible
|
|
219
|
+
pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
|
|
218
220
|
```
|
|
219
221
|
|
|
220
|
-
The base install
|
|
221
|
-
`scipy`, `pyarrow
|
|
222
|
-
only pay for what you use.
|
|
222
|
+
The base install's direct runtime dependencies are `numpy`, `pandas`,
|
|
223
|
+
`scipy`, and `pyarrow`; optional extras land per analytical layer so
|
|
224
|
+
you only pay for what you use. `[showcase]` is broken out separately
|
|
225
|
+
because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
|
|
226
|
+
that extra remains MIT-only.
|
|
223
227
|
|
|
224
228
|
To work from source:
|
|
225
229
|
|
|
@@ -232,13 +236,27 @@ pytest -q
|
|
|
232
236
|
|
|
233
237
|
## Cross-validation receipts
|
|
234
238
|
|
|
235
|
-
The math
|
|
239
|
+
The math is checked against standard tools by automated tests. The
|
|
240
|
+
fast tier runs on every push (matrix CI); the slow tier needs heavy
|
|
241
|
+
optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
|
|
242
|
+
downloads) and runs on main pushes only.
|
|
236
243
|
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
- **
|
|
240
|
-
|
|
241
|
-
|
|
244
|
+
Fast tier:
|
|
245
|
+
|
|
246
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
247
|
+
triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
|
|
248
|
+
|
|
249
|
+
Slow tier:
|
|
250
|
+
|
|
251
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
252
|
+
on every adjacent bigram
|
|
253
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
254
|
+
US Conventions corpus
|
|
255
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
256
|
+
`formula="dunning"`
|
|
257
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
258
|
+
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
259
|
+
gracefully if the archive isn't reachable)
|
|
242
260
|
|
|
243
261
|
## Citation
|
|
244
262
|
|
|
@@ -31,11 +31,11 @@ and computational social science routinely have:
|
|
|
31
31
|
`pycorpdiff` is positioned as **orchestration**, not reinvention.
|
|
32
32
|
Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
|
|
33
33
|
`SBERT`-compatible model) plug in via two `typing.Protocol` extension
|
|
34
|
-
points — one-line adapters, no plugin registry. The base install
|
|
35
|
-
|
|
36
|
-
via extras.
|
|
34
|
+
points — one-line adapters, no plugin registry. The base install's
|
|
35
|
+
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
36
|
+
`pyarrow`; everything else is opt-in via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.
|
|
38
|
+
> **Status: alpha (0.1.0a7).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -82,7 +82,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
|
82
82
|
|
|
83
83
|
```python
|
|
84
84
|
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
85
|
-
pcd.compare(a, b).keyness()
|
|
85
|
+
pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
|
|
86
|
+
pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
|
|
86
87
|
pcd.compare(a, b).collocation_shift("immigrant")
|
|
87
88
|
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
88
89
|
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
@@ -94,7 +95,7 @@ tr.changepoints() # offline PELT
|
|
|
94
95
|
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
95
96
|
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
96
97
|
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
97
|
-
tr.forecast(horizon=4) # state-space ETS
|
|
98
|
+
tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
|
|
98
99
|
|
|
99
100
|
# Before / after a known event
|
|
100
101
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
@@ -113,17 +114,20 @@ every analytical surface.
|
|
|
113
114
|
## Installation
|
|
114
115
|
|
|
115
116
|
```bash
|
|
116
|
-
pip install pycorpdiff
|
|
117
|
-
pip install "pycorpdiff[viz]"
|
|
118
|
-
pip install "pycorpdiff[semantic]"
|
|
119
|
-
pip install "pycorpdiff[temporal]"
|
|
120
|
-
pip install "pycorpdiff[notebooks]"
|
|
121
|
-
pip install "pycorpdiff[all]"
|
|
117
|
+
pip install pycorpdiff # lexical-comparative core (MIT)
|
|
118
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
119
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
120
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
121
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
|
|
122
|
+
pip install "pycorpdiff[all]" # everything MIT-compatible
|
|
123
|
+
pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
|
|
122
124
|
```
|
|
123
125
|
|
|
124
|
-
The base install
|
|
125
|
-
`scipy`, `pyarrow
|
|
126
|
-
only pay for what you use.
|
|
126
|
+
The base install's direct runtime dependencies are `numpy`, `pandas`,
|
|
127
|
+
`scipy`, and `pyarrow`; optional extras land per analytical layer so
|
|
128
|
+
you only pay for what you use. `[showcase]` is broken out separately
|
|
129
|
+
because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
|
|
130
|
+
that extra remains MIT-only.
|
|
127
131
|
|
|
128
132
|
To work from source:
|
|
129
133
|
|
|
@@ -136,13 +140,27 @@ pytest -q
|
|
|
136
140
|
|
|
137
141
|
## Cross-validation receipts
|
|
138
142
|
|
|
139
|
-
The math
|
|
143
|
+
The math is checked against standard tools by automated tests. The
|
|
144
|
+
fast tier runs on every push (matrix CI); the slow tier needs heavy
|
|
145
|
+
optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
|
|
146
|
+
downloads) and runs on main pushes only.
|
|
140
147
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
- **
|
|
144
|
-
|
|
145
|
-
|
|
148
|
+
Fast tier:
|
|
149
|
+
|
|
150
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference
|
|
151
|
+
triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
|
|
152
|
+
|
|
153
|
+
Slow tier:
|
|
154
|
+
|
|
155
|
+
- **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
|
|
156
|
+
on every adjacent bigram
|
|
157
|
+
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
158
|
+
US Conventions corpus
|
|
159
|
+
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
|
|
160
|
+
`formula="dunning"`
|
|
161
|
+
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
162
|
+
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
163
|
+
gracefully if the archive isn't reachable)
|
|
146
164
|
|
|
147
165
|
## Citation
|
|
148
166
|
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a5"
|
|
7
|
+
version = "0.1.0a7"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -62,13 +62,18 @@ nlp = ["spacy>=3.7"]
|
|
|
62
62
|
# Public-text-corpus hub. Heavy (pulls pyarrow, fsspec, requests, aiohttp),
|
|
63
63
|
# so opt-in only — base install stays small.
|
|
64
64
|
huggingface = ["datasets>=2.14"]
|
|
65
|
-
# Needed if you want to execute the
|
|
66
|
-
#
|
|
67
|
-
#
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
#
|
|
71
|
-
#
|
|
65
|
+
# Needed if you want to execute the example notebooks. `jupyter` runs
|
|
66
|
+
# the notebook; `vl-convert` does static SVG/PNG export of altair charts.
|
|
67
|
+
# Kept MIT-clean — see `showcase` below for the JAMA-style table polish.
|
|
68
|
+
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5"]
|
|
69
|
+
# Adds `pysofra` for the showcase notebook's JAMA-style typography.
|
|
70
|
+
# IMPORTANT: pysofra is GPL-3.0-or-later. Opting in to `[showcase]` (or
|
|
71
|
+
# installing pysofra directly) brings GPL into your environment; pure
|
|
72
|
+
# pycorpdiff use without this extra remains MIT-only.
|
|
73
|
+
showcase = ["pysofra>=0.1.0a3"]
|
|
74
|
+
# Meta-extra: every MIT-compatible optional code path. Does NOT include
|
|
75
|
+
# `[showcase]` because pysofra is GPL-3.0-or-later; install
|
|
76
|
+
# `pycorpdiff[all,showcase]` explicitly if you accept that licence.
|
|
72
77
|
all = [
|
|
73
78
|
"altair>=5",
|
|
74
79
|
"matplotlib>=3.8",
|
|
@@ -84,7 +89,6 @@ all = [
|
|
|
84
89
|
"spacy>=3.7",
|
|
85
90
|
"jupyter>=1.0",
|
|
86
91
|
"vl-convert-python>=1.5",
|
|
87
|
-
"pysofra>=0.1.0a3",
|
|
88
92
|
]
|
|
89
93
|
dev = [
|
|
90
94
|
"pytest>=8",
|
|
@@ -176,6 +180,8 @@ disallow_any_generics = true
|
|
|
176
180
|
module = [
|
|
177
181
|
"altair",
|
|
178
182
|
"altair.*",
|
|
183
|
+
"datasets",
|
|
184
|
+
"datasets.*",
|
|
179
185
|
"duckdb",
|
|
180
186
|
"duckdb.*",
|
|
181
187
|
"matplotlib",
|
|
@@ -6,20 +6,21 @@ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
|
|
|
6
6
|
:class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
|
|
7
7
|
:class:`NetworkResult`, :class:`ForecastResult`,
|
|
8
8
|
:class:`CausalImpactResult`, :class:`BocpdResult`,
|
|
9
|
-
:class:`ConcordanceResult`), each implementing the
|
|
10
|
-
``.to_df / .plot / .explain / .summary / .to_html / .to_json``
|
|
9
|
+
:class:`ConcordanceResult`), each implementing the relevant subset of
|
|
10
|
+
the ``.to_df / .plot / .explain / .summary / .to_html / .to_json``
|
|
11
|
+
contract. See ``docs/design.md`` for the per-Result method matrix.
|
|
11
12
|
|
|
12
13
|
Example
|
|
13
14
|
-------
|
|
14
15
|
|
|
15
16
|
>>> import pycorpdiff as pcd
|
|
16
|
-
>>> pcd.__version__
|
|
17
|
-
|
|
17
|
+
>>> isinstance(pcd.__version__, str)
|
|
18
|
+
True
|
|
18
19
|
"""
|
|
19
20
|
|
|
20
21
|
from __future__ import annotations
|
|
21
22
|
|
|
22
|
-
__version__ = "0.1.
|
|
23
|
+
__version__ = "0.1.0a7"
|
|
23
24
|
|
|
24
25
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
25
26
|
from .compare import Comparison, compare
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
"""Pandas-backed internals for :class:`pycorpdiff.Corpus`.
|
|
2
|
+
|
|
3
|
+
Corpus operations route through this module so backend-specific code
|
|
4
|
+
stays out of the public API. The pandas backend is the default and is
|
|
5
|
+
exercised on every install; polars is opt-in via the ``polars`` extra
|
|
6
|
+
and lives in the sibling :mod:`pycorpdiff._backends.polars`.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
@@ -10,6 +10,7 @@ from dataclasses import dataclass
|
|
|
10
10
|
from typing import TYPE_CHECKING, Literal
|
|
11
11
|
|
|
12
12
|
from .corpus import Corpus, CorpusSlice
|
|
13
|
+
from .keyness.loglikelihood import LLFormula
|
|
13
14
|
|
|
14
15
|
if TYPE_CHECKING:
|
|
15
16
|
from .results import (
|
|
@@ -46,6 +47,7 @@ class Comparison:
|
|
|
46
47
|
def keyness(
|
|
47
48
|
self,
|
|
48
49
|
method: KeynessMethod = "log_likelihood",
|
|
50
|
+
formula: LLFormula = "rayson",
|
|
49
51
|
effect_size: bool = True,
|
|
50
52
|
dispersion: bool = False,
|
|
51
53
|
min_count: int = 5,
|
|
@@ -64,6 +66,14 @@ class Comparison:
|
|
|
64
66
|
sorts by signed Pearson χ². The other modes
|
|
65
67
|
(``"log_ratio"``, ``"bayes_factor"``, ``"percent_diff"``)
|
|
66
68
|
require ``effect_size=True`` and sort by that column.
|
|
69
|
+
formula
|
|
70
|
+
Which log-likelihood formulation to use for the G² column.
|
|
71
|
+
``"rayson"`` (default) is the 2-cell shortcut matching
|
|
72
|
+
Rayson's UCREL LL Wizard; ``"dunning"`` is the full 4-cell
|
|
73
|
+
G² matching NLTK's ``BigramAssocMeasures`` and R's
|
|
74
|
+
``quanteda::textstat_keyness(measure="lr")``. See
|
|
75
|
+
``docs/statistical-methods.md`` for the math + when they
|
|
76
|
+
diverge.
|
|
67
77
|
effect_size
|
|
68
78
|
If True (default), also compute LogRatio (Hardie),
|
|
69
79
|
%DIFF (Gabrielatos), and the BIC-approximated Bayes factor.
|
|
@@ -131,7 +141,7 @@ class Comparison:
|
|
|
131
141
|
# G² is always computed (cheap, the default sort column). χ² is
|
|
132
142
|
# computed only when requested — same shape, asymptotically
|
|
133
143
|
# equivalent, no need to pay for both by default.
|
|
134
|
-
table = log_likelihood(a_kept, b_kept, n_a, n_b)
|
|
144
|
+
table = log_likelihood(a_kept, b_kept, n_a, n_b, formula=formula)
|
|
135
145
|
if method == "chi_squared":
|
|
136
146
|
chi_table = _chi_squared(a_kept, b_kept, n_a, n_b)
|
|
137
147
|
table["chi_squared"] = chi_table["chi_squared"]
|
|
@@ -139,7 +149,9 @@ class Comparison:
|
|
|
139
149
|
if effect_size:
|
|
140
150
|
table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
|
|
141
151
|
table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
|
|
142
|
-
table["bayes_factor"] = _bayes_factor(a_kept, b_kept, n_a, n_b)
|
|
152
|
+
table["bayes_factor"] = _bayes_factor(
|
|
153
|
+
a_kept, b_kept, n_a, n_b, formula=formula
|
|
154
|
+
)
|
|
143
155
|
|
|
144
156
|
if dispersion:
|
|
145
157
|
kept_terms = table.index
|
|
@@ -192,6 +204,7 @@ class Comparison:
|
|
|
192
204
|
label_a=_corpus_label(self.a),
|
|
193
205
|
label_b=_corpus_label(self.b),
|
|
194
206
|
params={
|
|
207
|
+
"formula": formula,
|
|
195
208
|
"effect_size": effect_size,
|
|
196
209
|
"dispersion": dispersion,
|
|
197
210
|
"min_count": min_count,
|
|
@@ -242,6 +242,15 @@ class Corpus:
|
|
|
242
242
|
"""
|
|
243
243
|
from .temporal.slicing import TemporalCorpus # local import to break cycle
|
|
244
244
|
|
|
245
|
+
if len(self.docs) == 0:
|
|
246
|
+
raise ValueError(
|
|
247
|
+
"by_time() requires a non-empty corpus; got 0 documents."
|
|
248
|
+
)
|
|
249
|
+
if col not in self.docs.columns:
|
|
250
|
+
raise ValueError(
|
|
251
|
+
f"by_time(col={col!r}, ...): column not found in corpus. "
|
|
252
|
+
f"Available columns: {list(self.docs.columns)!r}."
|
|
253
|
+
)
|
|
245
254
|
return TemporalCorpus(parent=self, time_col=col, freq=freq)
|
|
246
255
|
|
|
247
256
|
def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
|
|
@@ -71,12 +71,24 @@ def read_duckdb(
|
|
|
71
71
|
... )
|
|
72
72
|
"""
|
|
73
73
|
try:
|
|
74
|
-
import duckdb
|
|
74
|
+
import duckdb
|
|
75
75
|
except ImportError as exc: # pragma: no cover
|
|
76
76
|
raise ImportError(
|
|
77
77
|
"read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
|
|
78
78
|
) from exc
|
|
79
79
|
|
|
80
|
+
if isinstance(connection, str):
|
|
81
|
+
raise TypeError(
|
|
82
|
+
"read_duckdb expects a DuckDB connection, not a file path. "
|
|
83
|
+
f"Got connection={connection!r}. Open one first: "
|
|
84
|
+
f'duckdb.connect({connection!r}), or pcd.read_duckdb(duckdb.connect(), "...")'
|
|
85
|
+
)
|
|
86
|
+
if not isinstance(connection, duckdb.DuckDBPyConnection):
|
|
87
|
+
raise TypeError(
|
|
88
|
+
"read_duckdb expects a duckdb.DuckDBPyConnection; got "
|
|
89
|
+
f"{type(connection).__name__}. Open one via duckdb.connect(...)."
|
|
90
|
+
)
|
|
91
|
+
|
|
80
92
|
cursor = connection.execute(query, params) if params is not None else connection.execute(query)
|
|
81
93
|
df = cursor.df()
|
|
82
94
|
if text_col not in df.columns:
|
|
@@ -95,7 +95,7 @@ def from_huggingface(
|
|
|
95
95
|
loader = _loader
|
|
96
96
|
if loader is None:
|
|
97
97
|
try:
|
|
98
|
-
from datasets import load_dataset as _hf_load
|
|
98
|
+
from datasets import load_dataset as _hf_load
|
|
99
99
|
except ImportError as exc: # pragma: no cover
|
|
100
100
|
raise ImportError(
|
|
101
101
|
"from_huggingface requires the `datasets` library. "
|
|
@@ -15,7 +15,7 @@ from __future__ import annotations
|
|
|
15
15
|
import numpy as np
|
|
16
16
|
import pandas as pd
|
|
17
17
|
|
|
18
|
-
from .loglikelihood import log_likelihood
|
|
18
|
+
from .loglikelihood import LLFormula, log_likelihood
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
def bayes_factor(
|
|
@@ -23,6 +23,8 @@ def bayes_factor(
|
|
|
23
23
|
counts_b: pd.Series,
|
|
24
24
|
total_a: int,
|
|
25
25
|
total_b: int,
|
|
26
|
+
*,
|
|
27
|
+
formula: LLFormula = "rayson",
|
|
26
28
|
) -> pd.Series:
|
|
27
29
|
"""BIC-approximated Bayes factor for each term's frequency difference.
|
|
28
30
|
|
|
@@ -31,6 +33,12 @@ def bayes_factor(
|
|
|
31
33
|
the unsigned log-likelihood. The Bayes factor is then
|
|
32
34
|
``exp(BIC / 2)``. Wilson (2013) is the keyness application.
|
|
33
35
|
|
|
36
|
+
``formula`` selects which G² flavour feeds the BF: ``"rayson"`` (the
|
|
37
|
+
2-cell shortcut, default; matches the LL Wizard) or ``"dunning"``
|
|
38
|
+
(the full 4-cell G²; matches quanteda/NLTK). Use the same
|
|
39
|
+
``formula=`` as the ``keyness()`` call that produced the row so the
|
|
40
|
+
G² and the Bayes factor in a single row describe the same statistic.
|
|
41
|
+
|
|
34
42
|
Interpret with Kass & Raftery (1995):
|
|
35
43
|
|
|
36
44
|
- ``BF > 2`` : positive evidence
|
|
@@ -43,7 +51,7 @@ def bayes_factor(
|
|
|
43
51
|
plots / sorts handle it.
|
|
44
52
|
"""
|
|
45
53
|
terms = counts_a.index.union(counts_b.index)
|
|
46
|
-
ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
|
|
54
|
+
ll_table = log_likelihood(counts_a, counts_b, total_a, total_b, formula=formula)
|
|
47
55
|
g2_abs = ll_table["g2"].abs()
|
|
48
56
|
bic = g2_abs - np.log(total_a + total_b)
|
|
49
57
|
with np.errstate(over="ignore"):
|