pycorpdiff 0.1.0a9__tar.gz → 0.1.0a11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/CHANGELOG.md +8 -10
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/CITATION.cff +1 -1
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/PKG-INFO +2 -4
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/README.md +1 -3
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/pyproject.toml +1 -1
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/__init__.py +1 -1
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/collocation/measures.py +9 -4
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/correction.py +14 -1
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/semantic/shift.py +21 -6
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/causal_impact.py +22 -7
- pycorpdiff-0.1.0a9/tests/integration/test_crossval_quanteda.py +0 -153
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/.gitignore +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/LICENSE +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/compare.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/bayes.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/results.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_temporal.py +0 -0
- /pycorpdiff-0.1.0a9/tests/unit/test_audit_a7_fixes.py → /pycorpdiff-0.1.0a11/tests/unit/test_validation_contracts.py +0 -0
- {pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.0a9] — first public release
|
|
7
|
+
## [0.1.0a11] — first public release
|
|
8
8
|
|
|
9
9
|
The first public alpha of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
@@ -20,13 +20,13 @@ and notebook rendering.
|
|
|
20
20
|
|
|
21
21
|
- **Keyness**: signed log-likelihood G² with selectable formula
|
|
22
22
|
(`formula="rayson"` 2-cell shortcut, default; matches the UCREL
|
|
23
|
-
LL Wizard. `formula="dunning"` 4-cell G²;
|
|
24
|
-
`quanteda::textstat_keyness(measure="lr")`
|
|
25
|
-
χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated
|
|
26
|
-
factor (also tracks the `formula=` choice), Juilland D /
|
|
27
|
-
dispersion flagging, Benjamini–Hochberg correction,
|
|
28
|
-
filtering, empirical permutation *p*-values, N-way
|
|
29
|
-
via `keyness_multi`.
|
|
23
|
+
LL Wizard. `formula="dunning"` 4-cell G²; the canonical Dunning
|
|
24
|
+
1993 form used by NLTK and R's `quanteda::textstat_keyness(measure="lr")`).
|
|
25
|
+
Pearson χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated
|
|
26
|
+
Bayes factor (also tracks the `formula=` choice), Juilland D /
|
|
27
|
+
Gries DP dispersion flagging, Benjamini–Hochberg correction,
|
|
28
|
+
stop-word filtering, empirical permutation *p*-values, N-way
|
|
29
|
+
contingency G² via `keyness_multi`.
|
|
30
30
|
- **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
|
|
31
31
|
cross-corpus `collocation_shift`; co-occurrence networks via
|
|
32
32
|
`cooccurrence_network`.
|
|
@@ -49,8 +49,6 @@ The package is checked against standard tools by automated test:
|
|
|
49
49
|
on every adjacent bigram (slow tier).
|
|
50
50
|
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
51
51
|
US Conventions corpus (slow tier).
|
|
52
|
-
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-6 with
|
|
53
|
-
`formula="dunning"` (cross-runtime float-arithmetic ceiling; slow tier).
|
|
54
52
|
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
55
53
|
sanity check on Stanford SNAP COHA decade embeddings; skips
|
|
56
54
|
gracefully when the archive isn't reachable (slow tier).
|
|
@@ -4,7 +4,7 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.0a9
|
|
7
|
+
version: 0.1.0a11
|
|
8
8
|
date-released: 2026-05-25
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a9
|
|
3
|
+
Version: 0.1.0a11
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -131,7 +131,7 @@ points — one-line adapters, no plugin registry. The base install's
|
|
|
131
131
|
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
132
132
|
`pyarrow`; everything else is opt-in via extras.
|
|
133
133
|
|
|
134
|
-
> **Status: alpha (0.1.0a9).** Public API is stable for the features
|
|
134
|
+
> **Status: alpha (0.1.0a11).** Public API is stable for the features
|
|
135
135
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
136
136
|
|
|
137
137
|
## The three-layer architecture
|
|
@@ -252,8 +252,6 @@ Slow tier:
|
|
|
252
252
|
on every adjacent bigram
|
|
253
253
|
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
254
254
|
US Conventions corpus
|
|
255
|
-
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-6 with
|
|
256
|
-
`formula="dunning"` (cross-runtime float-arithmetic ceiling)
|
|
257
255
|
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
258
256
|
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
259
257
|
gracefully if the archive isn't reachable)
|
|
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install's
|
|
|
35
35
|
direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
|
|
36
36
|
`pyarrow`; everything else is opt-in via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.0a9).** Public API is stable for the features
|
|
38
|
+
> **Status: alpha (0.1.0a11).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -156,8 +156,6 @@ Slow tier:
|
|
|
156
156
|
on every adjacent bigram
|
|
157
157
|
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
|
|
158
158
|
US Conventions corpus
|
|
159
|
-
- **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-6 with
|
|
160
|
-
`formula="dunning"` (cross-runtime float-arithmetic ceiling)
|
|
161
159
|
- **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
|
|
162
160
|
sanity check on Stanford SNAP COHA decade embeddings (skips
|
|
163
161
|
gracefully if the archive isn't reachable)
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a9"
|
|
7
|
+
version = "0.1.0a11"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -16,14 +16,19 @@ NumPy / pandas conventions.
|
|
|
16
16
|
|
|
17
17
|
References
|
|
18
18
|
----------
|
|
19
|
-
|
|
20
|
-
*
|
|
19
|
+
Church, K. W., & Hanks, P. (1990). Word association norms, mutual
|
|
20
|
+
information, and lexicography. *Computational Linguistics*, 16(1),
|
|
21
|
+
22-29. (Pointwise mutual information for collocation.)
|
|
21
22
|
|
|
22
23
|
Church, K., Gale, W., Hanks, P., & Hindle, D. (1991). Using statistics in
|
|
23
|
-
lexical analysis. In *Lexical Acquisition*, 115-164.
|
|
24
|
+
lexical analysis. In *Lexical Acquisition*, 115-164. (t-score.)
|
|
24
25
|
|
|
25
26
|
Daille, B. (1994). *Approche mixte pour l'extraction automatique de
|
|
26
|
-
terminologie*. PhD thesis, Université Paris 7.
|
|
27
|
+
terminologie*. PhD thesis, Université Paris 7. (MI³ — cube weighting
|
|
28
|
+
of PMI to correct rare-pair inflation.)
|
|
29
|
+
|
|
30
|
+
Rychlý, P. (2008). A lexicographer-friendly association score. In
|
|
31
|
+
*Proceedings of RASLAN 2008*. (logDice.)
|
|
27
32
|
"""
|
|
28
33
|
|
|
29
34
|
from __future__ import annotations
|
|
@@ -1,4 +1,17 @@
|
|
|
1
|
-
"""Multiple-comparison correction for keyness *p*-value vectors."""
|
|
1
|
+
"""Multiple-comparison correction for keyness *p*-value vectors.
|
|
2
|
+
|
|
3
|
+
References
|
|
4
|
+
----------
|
|
5
|
+
Benjamini, Y., & Hochberg, Y. (1995). Controlling the false discovery
|
|
6
|
+
rate: A practical and powerful approach to multiple testing. *Journal
|
|
7
|
+
of the Royal Statistical Society: Series B*, 57(1), 289-300.
|
|
8
|
+
(BH-adjusted *p*-values; the FDR control used by default.)
|
|
9
|
+
|
|
10
|
+
Bonferroni, C. E. (1936). Teoria statistica delle classi e calcolo
|
|
11
|
+
delle probabilità. *Pubblicazioni del R Istituto Superiore di Scienze
|
|
12
|
+
Economiche e Commerciali di Firenze*, 8, 3-62. (Family-wise correction;
|
|
13
|
+
opt-in via ``multiple_comparisons="bonferroni"``.)
|
|
14
|
+
"""
|
|
2
15
|
|
|
3
16
|
from __future__ import annotations
|
|
4
17
|
|
|
@@ -1,14 +1,29 @@
|
|
|
1
1
|
"""Semantic shift and neighborhood drift between corpora.
|
|
2
2
|
|
|
3
|
-
The default strategy is *averaged contextual embeddings
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
|
|
3
|
+
The default strategy is *averaged contextual embeddings*
|
|
4
|
+
(Giulianelli, Del Tredici, & Fernández 2020): for each occurrence of
|
|
5
|
+
the target term in a corpus, encode its surrounding window as a
|
|
6
|
+
sentence, then average across occurrences. The corpus-specific
|
|
7
|
+
representation that comes out is what we compare.
|
|
7
8
|
|
|
8
9
|
This works with any shared-space embedder (SBERT, multilingual SBERT,
|
|
9
10
|
HuggingFace encoders). For Hamilton-style independently-trained
|
|
10
|
-
embeddings,
|
|
11
|
-
|
|
11
|
+
embeddings (Hamilton, Leskovec, & Jurafsky 2016), supply
|
|
12
|
+
``align="procrustes"`` to rotate the source space onto the target
|
|
13
|
+
space before comparison (Schönemann 1966).
|
|
14
|
+
|
|
15
|
+
References
|
|
16
|
+
----------
|
|
17
|
+
Giulianelli, M., Del Tredici, M., & Fernández, R. (2020). Analysing
|
|
18
|
+
lexical semantic change with contextualised word representations. In
|
|
19
|
+
*Proceedings of ACL 2020*, 3960-3973.
|
|
20
|
+
|
|
21
|
+
Hamilton, W. L., Leskovec, J., & Jurafsky, D. (2016). Diachronic word
|
|
22
|
+
embeddings reveal statistical laws of semantic change. In *Proceedings
|
|
23
|
+
of ACL 2016*.
|
|
24
|
+
|
|
25
|
+
Schönemann, P. H. (1966). A generalized solution of the orthogonal
|
|
26
|
+
Procrustes problem. *Psychometrika*, 31(1), 1-10.
|
|
12
27
|
"""
|
|
13
28
|
|
|
14
29
|
from __future__ import annotations
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Bayesian counterfactual causal impact
|
|
1
|
+
"""Bayesian counterfactual causal impact — no-control variant.
|
|
2
2
|
|
|
3
3
|
The :func:`interrupted_time_series` module answers "is there a step
|
|
4
4
|
discontinuity at this known event?" via segmented OLS. The harder —
|
|
@@ -7,20 +7,35 @@ looked like *without* the event?" That's the counterfactual, and the
|
|
|
7
7
|
gap between observed reality and counterfactual prediction is the
|
|
8
8
|
causal effect of the event.
|
|
9
9
|
|
|
10
|
-
Method:
|
|
11
|
-
|
|
12
|
-
forward as the counterfactual for the post-event
|
|
13
|
-
via :class:`statsmodels.tsa.UnobservedComponents`.
|
|
14
|
-
intervals on the pointwise and cumulative effects come from
|
|
15
|
-
Carlo simulation against the joint posterior of the state-space
|
|
10
|
+
Method: a univariate structural time-series counterfactual. A local
|
|
11
|
+
linear trend (Bayesian state-space) model is fit on the pre-event
|
|
12
|
+
window and projected forward as the counterfactual for the post-event
|
|
13
|
+
window. Implemented via :class:`statsmodels.tsa.UnobservedComponents`.
|
|
14
|
+
Credible intervals on the pointwise and cumulative effects come from
|
|
15
|
+
Monte Carlo simulation against the joint posterior of the state-space
|
|
16
16
|
filter — anchored at the end of the pre-event training data and rolled
|
|
17
17
|
forward through the post-event horizon.
|
|
18
18
|
|
|
19
|
+
**Scope and caveats.** This is the *no-control* variant of the
|
|
20
|
+
Brodersen et al. (2015) framework: pycorpdiff observes only the target
|
|
21
|
+
series itself, not a panel of unaffected control series. The canonical
|
|
22
|
+
Google ``CausalImpact`` implementation uses BSTS with spike-and-slab
|
|
23
|
+
regression on parallel control series; that machinery is not in scope
|
|
24
|
+
here. The univariate version is appropriate when no obvious control
|
|
25
|
+
series exists (the common case in corpus-linguistic event studies),
|
|
26
|
+
but its counterfactual relies entirely on extrapolation of the pre-
|
|
27
|
+
event trend — be wary of distribution shifts unrelated to the event.
|
|
28
|
+
|
|
19
29
|
Reference
|
|
20
30
|
---------
|
|
21
31
|
Brodersen, K. H., Gallusser, F., Koehler, J., Remy, N., & Scott, S. L.
|
|
22
32
|
(2015). Inferring causal impact using Bayesian structural time-series
|
|
23
33
|
models. *Annals of Applied Statistics*, 9(1), 247-274.
|
|
34
|
+
(pycorpdiff implements a no-control simplification of this framework.)
|
|
35
|
+
|
|
36
|
+
Harvey, A. C. (1989). *Forecasting, Structural Time Series Models and
|
|
37
|
+
the Kalman Filter*. Cambridge University Press. (Local linear trend
|
|
38
|
+
state-space specification.)
|
|
24
39
|
"""
|
|
25
40
|
|
|
26
41
|
from __future__ import annotations
|
|
@@ -1,153 +0,0 @@
|
|
|
1
|
-
"""Cross-validation against R's quanteda (Benoit et al. 2018).
|
|
2
|
-
|
|
3
|
-
quanteda's ``textstat_keyness(measure="lr")`` computes the same
|
|
4
|
-
Dunning log-likelihood our :func:`pycorpdiff.keyness.log_likelihood`
|
|
5
|
-
does. With identical inputs they should produce byte-identical G²
|
|
6
|
-
values modulo floating-point representation.
|
|
7
|
-
|
|
8
|
-
This is the highest-credibility cross-validation we can ship: pycorpdiff
|
|
9
|
-
demonstrably agrees with the R reference implementation on the math,
|
|
10
|
-
on the same fixture, to 6 decimals. It's the receipt that turns
|
|
11
|
-
"the math is correct" into "the math agrees with the standard tool".
|
|
12
|
-
|
|
13
|
-
Requirements
|
|
14
|
-
------------
|
|
15
|
-
|
|
16
|
-
- R installed (any 4.x)
|
|
17
|
-
- ``install.packages("quanteda")`` from a CRAN mirror
|
|
18
|
-
- ``pip install rpy2``
|
|
19
|
-
|
|
20
|
-
Skips silently if rpy2 isn't installed *or* if R doesn't have quanteda.
|
|
21
|
-
"""
|
|
22
|
-
|
|
23
|
-
from __future__ import annotations
|
|
24
|
-
|
|
25
|
-
import math
|
|
26
|
-
|
|
27
|
-
import pandas as pd
|
|
28
|
-
import pytest
|
|
29
|
-
|
|
30
|
-
import pycorpdiff as pcd
|
|
31
|
-
|
|
32
|
-
rpy2 = pytest.importorskip("rpy2")
|
|
33
|
-
rpy2_robjects = pytest.importorskip("rpy2.robjects")
|
|
34
|
-
|
|
35
|
-
pytestmark = pytest.mark.slow
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
def _r_has_quanteda() -> bool:
|
|
39
|
-
"""Probe whether quanteda is installed in this R environment."""
|
|
40
|
-
try:
|
|
41
|
-
from rpy2.robjects.packages import importr
|
|
42
|
-
|
|
43
|
-
importr("quanteda")
|
|
44
|
-
importr("quanteda.textstats")
|
|
45
|
-
return True
|
|
46
|
-
except Exception: # pragma: no cover - environment-dependent
|
|
47
|
-
return False
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
@pytest.fixture(scope="module")
|
|
51
|
-
def fixture_corpus() -> pcd.Corpus:
|
|
52
|
-
"""A small two-class fixture with clean signal."""
|
|
53
|
-
rows = [
|
|
54
|
-
{"text": "the migrant worker arrived and settled here peacefully", "frame": "A"},
|
|
55
|
-
{"text": "the migrant family thrived in our welcoming community", "frame": "A"},
|
|
56
|
-
{"text": "the migrant community grew through worker organisation", "frame": "A"},
|
|
57
|
-
{"text": "the migrant family and worker rights advanced together", "frame": "A"},
|
|
58
|
-
{"text": "the migrant criminal threat grew unchecked at our borders", "frame": "B"},
|
|
59
|
-
{"text": "the migrant invasion of criminal gangs spread rapidly here", "frame": "B"},
|
|
60
|
-
{"text": "the migrant criminal element alarmed residents nationwide", "frame": "B"},
|
|
61
|
-
{"text": "the migrant gangs threaten the border and the criminal risk", "frame": "B"},
|
|
62
|
-
]
|
|
63
|
-
return pcd.from_dataframe(pd.DataFrame(rows), text_col="text", meta_cols=("frame",))
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
def _quanteda_keyness(corpus_df: pd.DataFrame) -> pd.DataFrame:
|
|
67
|
-
"""Run quanteda's textstat_keyness(measure='lr') on the same corpus.
|
|
68
|
-
|
|
69
|
-
Returns a DataFrame with columns ``feature`` and ``g2`` (the
|
|
70
|
-
log-likelihood). Sign matches quanteda's convention.
|
|
71
|
-
"""
|
|
72
|
-
if not _r_has_quanteda():
|
|
73
|
-
pytest.skip("quanteda not installed in the R environment")
|
|
74
|
-
|
|
75
|
-
from rpy2.robjects import default_converter, pandas2ri, r
|
|
76
|
-
from rpy2.robjects.conversion import localconverter
|
|
77
|
-
|
|
78
|
-
# rpy2 >= 3.5 deprecated pandas2ri.activate() in favour of a
|
|
79
|
-
# context-managed converter. Within `localconverter(...)` the
|
|
80
|
-
# `pandas2ri` converter is registered, so `r("out_df")` auto-converts
|
|
81
|
-
# the R data.frame to a pandas DataFrame on the way out; no explicit
|
|
82
|
-
# `pandas2ri.rpy2py(...)` call is needed (that's the deprecated
|
|
83
|
-
# pre-context-manager idiom and now raises NotImplementedError when
|
|
84
|
-
# the object is already pandas).
|
|
85
|
-
with localconverter(default_converter + pandas2ri.converter):
|
|
86
|
-
r("library(quanteda)")
|
|
87
|
-
r("library(quanteda.textstats)")
|
|
88
|
-
r.assign("docs_df", pandas2ri.py2rpy(corpus_df))
|
|
89
|
-
|
|
90
|
-
r(
|
|
91
|
-
"""
|
|
92
|
-
cps <- corpus(docs_df, text_field = "text")
|
|
93
|
-
toks <- tokens(cps, remove_punct = TRUE)
|
|
94
|
-
dfm_obj <- dfm(toks, tolower = TRUE)
|
|
95
|
-
keyness <- textstat_keyness(
|
|
96
|
-
dfm_obj,
|
|
97
|
-
target = which(docvars(cps, "frame") == "A"),
|
|
98
|
-
measure = "lr"
|
|
99
|
-
)
|
|
100
|
-
out_df <- as.data.frame(keyness)
|
|
101
|
-
"""
|
|
102
|
-
)
|
|
103
|
-
out = r("out_df") # auto-converted to pandas via localconverter
|
|
104
|
-
|
|
105
|
-
out = out.rename(columns={"G2": "g2"})
|
|
106
|
-
return out[["feature", "g2"]]
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
def test_log_likelihood_matches_quanteda_byte_for_byte(
|
|
110
|
-
fixture_corpus: pcd.Corpus,
|
|
111
|
-
) -> None:
|
|
112
|
-
"""For every term shared with quanteda (using formula='dunning'),
|
|
113
|
-
our signed G² agrees to ≤ 1e-6.
|
|
114
|
-
|
|
115
|
-
quanteda's ``textstat_keyness(measure="lr")`` uses the full 4-cell
|
|
116
|
-
Dunning G². The Rayson 2-cell shortcut (our default) is a different
|
|
117
|
-
statistic; comparing like-to-like requires passing ``formula="dunning"``.
|
|
118
|
-
|
|
119
|
-
The 1e-6 tolerance reflects the realistic cross-runtime
|
|
120
|
-
floating-point ceiling between R (BLAS-via-R) and NumPy
|
|
121
|
-
(BLAS-via-Python); accumulation order in xlogy differs subtly
|
|
122
|
-
between the two stacks. Tightening below ~1e-7 produces sporadic
|
|
123
|
-
failures on otherwise-identical math.
|
|
124
|
-
"""
|
|
125
|
-
a = fixture_corpus.slice(frame="A")
|
|
126
|
-
b = fixture_corpus.slice(frame="B")
|
|
127
|
-
ours = (
|
|
128
|
-
pcd.compare(a, b)
|
|
129
|
-
.keyness(min_count=1, formula="dunning")
|
|
130
|
-
.table.set_index("term")["g2"]
|
|
131
|
-
)
|
|
132
|
-
|
|
133
|
-
quanteda_df = _quanteda_keyness(fixture_corpus.docs.copy())
|
|
134
|
-
theirs = pd.Series(
|
|
135
|
-
quanteda_df["g2"].to_numpy(), index=quanteda_df["feature"].to_numpy()
|
|
136
|
-
)
|
|
137
|
-
|
|
138
|
-
shared = set(ours.index) & set(theirs.index)
|
|
139
|
-
assert len(shared) >= 5, (
|
|
140
|
-
f"too few shared terms for a meaningful comparison ({len(shared)})"
|
|
141
|
-
)
|
|
142
|
-
|
|
143
|
-
for term in shared:
|
|
144
|
-
ours_v = float(ours[term])
|
|
145
|
-
theirs_v = float(theirs[term])
|
|
146
|
-
# quanteda's textstat_keyness uses signed G² with the same
|
|
147
|
-
# convention we do: positive when overused in the target
|
|
148
|
-
# group. With matching formulae, the two implementations
|
|
149
|
-
# agree to cross-runtime float-arithmetic noise (~1e-7 on
|
|
150
|
-
# this fixture, well below the 1e-6 ceiling we assert here).
|
|
151
|
-
assert math.isclose(ours_v, theirs_v, abs_tol=1e-6), (
|
|
152
|
-
f"{term}: pycorpdiff={ours_v}, quanteda={theirs_v}"
|
|
153
|
-
)
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a9 → pycorpdiff-0.1.0a11}/tests/integration/test_collocation_integration.py
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|