pycorpdiff 0.1.0a0__tar.gz → 0.1.0a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/CHANGELOG.md +1 -1
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/CITATION.cff +7 -14
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/PKG-INFO +49 -32
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/README.md +47 -30
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/pyproject.toml +11 -13
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/__init__.py +9 -10
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/results.py +10 -1
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_smoke.py +4 -6
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/.gitignore +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/LICENSE +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/compare.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/bayes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_quanteda.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.
|
|
7
|
+
## [0.1.0a2] — initial release
|
|
8
8
|
|
|
9
9
|
The initial public release of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
cff-version: 1.2.0
|
|
2
2
|
message: >
|
|
3
|
-
If you use pycorpdiff in academic work, please cite
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
preparation; the draft will live in this repository as paper/paper.tex.
|
|
3
|
+
If you use pycorpdiff in academic work, please cite this software
|
|
4
|
+
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
|
+
this file.
|
|
7
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
8
|
-
version: 0.1.
|
|
7
|
+
version: 0.1.0a2
|
|
9
8
|
date-released: 2026-05-22
|
|
10
9
|
authors:
|
|
11
10
|
- family-names: Turner
|
|
@@ -34,16 +33,10 @@ abstract: >
|
|
|
34
33
|
computational social science, and discourse analysis research,
|
|
35
34
|
emphasising interpretability, explainability, statistical rigour,
|
|
36
35
|
and reproducibility.
|
|
37
|
-
preferred-citation:
|
|
38
|
-
type: article
|
|
39
|
-
authors:
|
|
40
|
-
- family-names: Turner
|
|
41
|
-
given-names: Jason
|
|
42
|
-
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
43
|
-
journal: "Journal of Statistical Software"
|
|
44
|
-
year: 2026
|
|
45
|
-
status: in-preparation
|
|
46
36
|
identifiers:
|
|
47
37
|
- type: url
|
|
48
38
|
value: "https://github.com/jturner-uofl/pycorpdiff"
|
|
49
39
|
description: Project repository
|
|
40
|
+
- type: url
|
|
41
|
+
value: "https://pypi.org/project/pycorpdiff/"
|
|
42
|
+
description: PyPI release
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a0
|
|
3
|
+
Version: 0.1.0a2
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -30,7 +30,7 @@ License: MIT License
|
|
|
30
30
|
SOFTWARE.
|
|
31
31
|
License-File: LICENSE
|
|
32
32
|
Keywords: collocation,comparative corpus analysis,computational social science,corpus linguistics,diachronic nlp,digital humanities,discourse analysis,keyness,semantic change,temporal text analysis
|
|
33
|
-
Classifier: Development Status ::
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
34
|
Classifier: Intended Audience :: Science/Research
|
|
35
35
|
Classifier: License :: OSI Approved :: MIT License
|
|
36
36
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -95,15 +95,10 @@ Description-Content-Type: text/markdown
|
|
|
95
95
|
|
|
96
96
|
# pycorpdiff
|
|
97
97
|
|
|
98
|
-
<!--
|
|
99
|
-
TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
|
|
100
|
-
|
|
101
98
|
[](https://pypi.org/project/pycorpdiff/)
|
|
102
99
|
[](https://pypi.org/project/pycorpdiff/)
|
|
103
100
|
[](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
|
|
104
|
-
[](https://doi.org/10.5281/zenodo.<RECORD>)
|
|
105
101
|
[](https://opensource.org/licenses/MIT)
|
|
106
|
-
-->
|
|
107
102
|
|
|
108
103
|
**Comparative corpus analysis for modern Python workflows.**
|
|
109
104
|
|
|
@@ -135,8 +130,8 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
135
130
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
136
131
|
via extras.
|
|
137
132
|
|
|
138
|
-
> **Status:
|
|
139
|
-
>
|
|
133
|
+
> **Status: alpha (0.1.0a2).** Public API is stable for the features
|
|
134
|
+
> described below; on PyPI as `pip install pycorpdiff`.
|
|
140
135
|
|
|
141
136
|
## The three-layer architecture
|
|
142
137
|
|
|
@@ -148,36 +143,53 @@ via extras.
|
|
|
148
143
|
|
|
149
144
|
## Quick start
|
|
150
145
|
|
|
146
|
+
```bash
|
|
147
|
+
pip install "pycorpdiff[viz,temporal]"
|
|
148
|
+
```
|
|
149
|
+
|
|
151
150
|
```python
|
|
152
151
|
import pycorpdiff as pcd
|
|
153
152
|
|
|
154
|
-
|
|
153
|
+
# Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
|
|
154
|
+
corpus = pcd.load_hansard_sample()
|
|
155
|
+
immigration = corpus.slice(topic="immigration")
|
|
156
|
+
human = immigration.slice(frame="humanising")
|
|
157
|
+
criminal = immigration.slice(frame="criminalising")
|
|
155
158
|
|
|
156
159
|
# Compare — three verbs
|
|
157
|
-
k = pcd.compare(
|
|
158
|
-
c = pcd.compare(
|
|
159
|
-
s = pcd.compare(
|
|
160
|
+
k = pcd.compare(human, criminal).keyness()
|
|
161
|
+
c = pcd.compare(human, criminal).collocation_shift("immigrant")
|
|
162
|
+
# s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
|
|
163
|
+
# ↑ requires `pip install "pycorpdiff[semantic]"`
|
|
160
164
|
|
|
161
165
|
# Track over time
|
|
162
|
-
tr = pcd.track(
|
|
163
|
-
tr.changepoints()
|
|
164
|
-
tr.changepoints_online(hazard=1/24)
|
|
165
|
-
tr.interrupted_time_series(event_date="2016
|
|
166
|
-
tr.causal_impact(event_date="2016
|
|
167
|
-
tr.forecast(horizon=4)
|
|
166
|
+
tr = pcd.track(immigration, "criminal").over_time(freq="Y")
|
|
167
|
+
tr.changepoints() # offline PELT
|
|
168
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
169
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
170
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
171
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
168
172
|
|
|
169
173
|
# Before / after a known event
|
|
170
|
-
pcd.compare.before_after(
|
|
174
|
+
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
171
175
|
|
|
172
|
-
# N-way (≥ 2 corpora)
|
|
173
|
-
|
|
176
|
+
# N-way (≥ 2 corpora) — one keyness across all four parties
|
|
177
|
+
parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
|
|
178
|
+
nhs = corpus.slice(topic="nhs")
|
|
179
|
+
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
174
180
|
|
|
175
181
|
# The discourse as a graph
|
|
176
|
-
pcd.cooccurrence_network(
|
|
182
|
+
pcd.cooccurrence_network(immigration, top_n=30).plot()
|
|
177
183
|
|
|
178
184
|
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
179
185
|
```
|
|
180
186
|
|
|
187
|
+
Every line of the snippet above is verified end-to-end against
|
|
188
|
+
`pip install "pycorpdiff[viz,temporal]"` — no data download required.
|
|
189
|
+
Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
190
|
+
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
191
|
+
`pcd.from_huggingface(...)` to use your own corpus.
|
|
192
|
+
|
|
181
193
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
182
194
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
183
195
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -185,23 +197,28 @@ surface.
|
|
|
185
197
|
|
|
186
198
|
## Installation
|
|
187
199
|
|
|
188
|
-
|
|
200
|
+
```bash
|
|
201
|
+
pip install pycorpdiff # lexical-comparative core
|
|
202
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
203
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
204
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
205
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
|
|
206
|
+
pip install "pycorpdiff[all]" # everything
|
|
207
|
+
```
|
|
189
208
|
|
|
190
|
-
|
|
209
|
+
The base install keeps a small dependency footprint (`numpy`, `pandas`,
|
|
210
|
+
`scipy`, `pyarrow`); optional extras land per analytical layer so you
|
|
211
|
+
only pay for what you use.
|
|
212
|
+
|
|
213
|
+
To work from source:
|
|
191
214
|
|
|
192
215
|
```bash
|
|
193
216
|
git clone https://github.com/jturner-uofl/pycorpdiff
|
|
194
217
|
cd pycorpdiff
|
|
195
218
|
pip install -e ".[dev]"
|
|
196
|
-
pytest -q
|
|
219
|
+
pytest -q
|
|
197
220
|
```
|
|
198
221
|
|
|
199
|
-
Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
|
|
200
|
-
(sentence-transformers + scikit-learn), `[temporal]` (ruptures +
|
|
201
|
-
statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
|
|
202
|
-
`[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
|
|
203
|
-
or `[all]`.
|
|
204
|
-
|
|
205
222
|
## Cross-validation receipts
|
|
206
223
|
|
|
207
224
|
The math agrees with the standard tools — by automated test:
|
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
# pycorpdiff
|
|
2
2
|
|
|
3
|
-
<!--
|
|
4
|
-
TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
|
|
5
|
-
|
|
6
3
|
[](https://pypi.org/project/pycorpdiff/)
|
|
7
4
|
[](https://pypi.org/project/pycorpdiff/)
|
|
8
5
|
[](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
|
|
9
|
-
[](https://doi.org/10.5281/zenodo.<RECORD>)
|
|
10
6
|
[](https://opensource.org/licenses/MIT)
|
|
11
|
-
-->
|
|
12
7
|
|
|
13
8
|
**Comparative corpus analysis for modern Python workflows.**
|
|
14
9
|
|
|
@@ -40,8 +35,8 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
40
35
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
41
36
|
via extras.
|
|
42
37
|
|
|
43
|
-
> **Status:
|
|
44
|
-
>
|
|
38
|
+
> **Status: alpha (0.1.0a2).** Public API is stable for the features
|
|
39
|
+
> described below; on PyPI as `pip install pycorpdiff`.
|
|
45
40
|
|
|
46
41
|
## The three-layer architecture
|
|
47
42
|
|
|
@@ -53,36 +48,53 @@ via extras.
|
|
|
53
48
|
|
|
54
49
|
## Quick start
|
|
55
50
|
|
|
51
|
+
```bash
|
|
52
|
+
pip install "pycorpdiff[viz,temporal]"
|
|
53
|
+
```
|
|
54
|
+
|
|
56
55
|
```python
|
|
57
56
|
import pycorpdiff as pcd
|
|
58
57
|
|
|
59
|
-
|
|
58
|
+
# Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
|
|
59
|
+
corpus = pcd.load_hansard_sample()
|
|
60
|
+
immigration = corpus.slice(topic="immigration")
|
|
61
|
+
human = immigration.slice(frame="humanising")
|
|
62
|
+
criminal = immigration.slice(frame="criminalising")
|
|
60
63
|
|
|
61
64
|
# Compare — three verbs
|
|
62
|
-
k = pcd.compare(
|
|
63
|
-
c = pcd.compare(
|
|
64
|
-
s = pcd.compare(
|
|
65
|
+
k = pcd.compare(human, criminal).keyness()
|
|
66
|
+
c = pcd.compare(human, criminal).collocation_shift("immigrant")
|
|
67
|
+
# s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
|
|
68
|
+
# ↑ requires `pip install "pycorpdiff[semantic]"`
|
|
65
69
|
|
|
66
70
|
# Track over time
|
|
67
|
-
tr = pcd.track(
|
|
68
|
-
tr.changepoints()
|
|
69
|
-
tr.changepoints_online(hazard=1/24)
|
|
70
|
-
tr.interrupted_time_series(event_date="2016
|
|
71
|
-
tr.causal_impact(event_date="2016
|
|
72
|
-
tr.forecast(horizon=4)
|
|
71
|
+
tr = pcd.track(immigration, "criminal").over_time(freq="Y")
|
|
72
|
+
tr.changepoints() # offline PELT
|
|
73
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
74
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
75
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
76
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
73
77
|
|
|
74
78
|
# Before / after a known event
|
|
75
|
-
pcd.compare.before_after(
|
|
79
|
+
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
76
80
|
|
|
77
|
-
# N-way (≥ 2 corpora)
|
|
78
|
-
|
|
81
|
+
# N-way (≥ 2 corpora) — one keyness across all four parties
|
|
82
|
+
parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
|
|
83
|
+
nhs = corpus.slice(topic="nhs")
|
|
84
|
+
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
79
85
|
|
|
80
86
|
# The discourse as a graph
|
|
81
|
-
pcd.cooccurrence_network(
|
|
87
|
+
pcd.cooccurrence_network(immigration, top_n=30).plot()
|
|
82
88
|
|
|
83
89
|
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
84
90
|
```
|
|
85
91
|
|
|
92
|
+
Every line of the snippet above is verified end-to-end against
|
|
93
|
+
`pip install "pycorpdiff[viz,temporal]"` — no data download required.
|
|
94
|
+
Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
95
|
+
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
96
|
+
`pcd.from_huggingface(...)` to use your own corpus.
|
|
97
|
+
|
|
86
98
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
87
99
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
88
100
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -90,23 +102,28 @@ surface.
|
|
|
90
102
|
|
|
91
103
|
## Installation
|
|
92
104
|
|
|
93
|
-
|
|
105
|
+
```bash
|
|
106
|
+
pip install pycorpdiff # lexical-comparative core
|
|
107
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
108
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
109
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
110
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
|
|
111
|
+
pip install "pycorpdiff[all]" # everything
|
|
112
|
+
```
|
|
94
113
|
|
|
95
|
-
|
|
114
|
+
The base install keeps a small dependency footprint (`numpy`, `pandas`,
|
|
115
|
+
`scipy`, `pyarrow`); optional extras land per analytical layer so you
|
|
116
|
+
only pay for what you use.
|
|
117
|
+
|
|
118
|
+
To work from source:
|
|
96
119
|
|
|
97
120
|
```bash
|
|
98
121
|
git clone https://github.com/jturner-uofl/pycorpdiff
|
|
99
122
|
cd pycorpdiff
|
|
100
123
|
pip install -e ".[dev]"
|
|
101
|
-
pytest -q
|
|
124
|
+
pytest -q
|
|
102
125
|
```
|
|
103
126
|
|
|
104
|
-
Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
|
|
105
|
-
(sentence-transformers + scikit-learn), `[temporal]` (ruptures +
|
|
106
|
-
statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
|
|
107
|
-
`[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
|
|
108
|
-
or `[all]`.
|
|
109
|
-
|
|
110
127
|
## Cross-validation receipts
|
|
111
128
|
|
|
112
129
|
The math agrees with the standard tools — by automated test:
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a0"
|
|
7
|
+
version = "0.1.0a2"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -23,7 +23,7 @@ keywords = [
|
|
|
23
23
|
"temporal text analysis",
|
|
24
24
|
]
|
|
25
25
|
classifiers = [
|
|
26
|
-
"Development Status ::
|
|
26
|
+
"Development Status :: 3 - Alpha",
|
|
27
27
|
"Intended Audience :: Science/Research",
|
|
28
28
|
"License :: OSI Approved :: MIT License",
|
|
29
29
|
"Programming Language :: Python :: 3",
|
|
@@ -36,9 +36,9 @@ classifiers = [
|
|
|
36
36
|
]
|
|
37
37
|
dependencies = [
|
|
38
38
|
"numpy>=1.24",
|
|
39
|
-
# Capped at <3
|
|
40
|
-
#
|
|
41
|
-
#
|
|
39
|
+
# Capped at <3: pandas 3.x raises Pandas4Warning under strict
|
|
40
|
+
# warning filters via third-party DataFrame copies. Lift when the
|
|
41
|
+
# wider PyData stack catches up.
|
|
42
42
|
"pandas>=2.0,<3",
|
|
43
43
|
"scipy>=1.11",
|
|
44
44
|
"pyarrow>=14",
|
|
@@ -47,13 +47,12 @@ dependencies = [
|
|
|
47
47
|
[project.optional-dependencies]
|
|
48
48
|
# Visualisation: altair-first, matplotlib retained for paper-grade figures.
|
|
49
49
|
viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
|
|
50
|
-
# Embedding-based semantic shift
|
|
51
|
-
#
|
|
50
|
+
# Embedding-based semantic shift. sentence-transformers pulls torch
|
|
51
|
+
# transitively, which is why this is opt-in rather than a base dep.
|
|
52
52
|
semantic = ["sentence-transformers>=2.2", "scikit-learn>=1.3"]
|
|
53
53
|
# Changepoint detection + interrupted time series.
|
|
54
54
|
temporal = ["ruptures>=1.1", "statsmodels>=0.14"]
|
|
55
|
-
# Optional columnar backend. polars.from_pandas() needs pyarrow at runtime
|
|
56
|
-
# (mirrors pysofra's pinning).
|
|
55
|
+
# Optional columnar backend. polars.from_pandas() needs pyarrow at runtime.
|
|
57
56
|
polars = ["polars>=1.0", "pyarrow>=15"]
|
|
58
57
|
# Out-of-core querying for large corpora.
|
|
59
58
|
duckdb = ["duckdb>=0.10"]
|
|
@@ -92,9 +91,9 @@ dev = [
|
|
|
92
91
|
"ruff>=0.4",
|
|
93
92
|
"mypy>=1.8",
|
|
94
93
|
"pre-commit>=3.6",
|
|
95
|
-
# pandas-stubs sharpens mypy strict typing for pandas surfaces
|
|
96
|
-
#
|
|
97
|
-
#
|
|
94
|
+
# pandas-stubs sharpens mypy strict typing for pandas surfaces.
|
|
95
|
+
# Without it, ignore_missing_imports would mask real typing
|
|
96
|
+
# regressions in pandas-mediated code paths.
|
|
98
97
|
"pandas-stubs>=2.2",
|
|
99
98
|
]
|
|
100
99
|
[project.urls]
|
|
@@ -119,7 +118,6 @@ include = [
|
|
|
119
118
|
exclude = [
|
|
120
119
|
"docs",
|
|
121
120
|
"examples",
|
|
122
|
-
"site",
|
|
123
121
|
".github",
|
|
124
122
|
"uv.lock",
|
|
125
123
|
]
|
|
@@ -1,26 +1,25 @@
|
|
|
1
1
|
"""pycorpdiff — comparative corpus analysis for modern Python workflows.
|
|
2
2
|
|
|
3
3
|
The package exposes three public verbs (:func:`compare`, :func:`track`,
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
:class:`
|
|
7
|
-
:class:`
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
of the roadmap lands.
|
|
4
|
+
:func:`compare.before_after`) and a family of frozen-dataclass
|
|
5
|
+
result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
|
|
6
|
+
:class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
|
|
7
|
+
:class:`NetworkResult`, :class:`ForecastResult`,
|
|
8
|
+
:class:`CausalImpactResult`, :class:`BocpdResult`,
|
|
9
|
+
:class:`ConcordanceResult`), each implementing the same
|
|
10
|
+
``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
|
|
12
11
|
|
|
13
12
|
Example
|
|
14
13
|
-------
|
|
15
14
|
|
|
16
15
|
>>> import pycorpdiff as pcd
|
|
17
16
|
>>> pcd.__version__
|
|
18
|
-
'0.1.
|
|
17
|
+
'0.1.0a2'
|
|
19
18
|
"""
|
|
20
19
|
|
|
21
20
|
from __future__ import annotations
|
|
22
21
|
|
|
23
|
-
__version__ = "0.1.
|
|
22
|
+
__version__ = "0.1.0a2"
|
|
24
23
|
|
|
25
24
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
26
25
|
from .compare import Comparison, compare
|
|
@@ -251,7 +251,16 @@ class SemanticShiftResult:
|
|
|
251
251
|
return _table_to_json(self.table, path, **kw)
|
|
252
252
|
|
|
253
253
|
def plot(self, **kw: Any) -> alt.Chart:
|
|
254
|
-
|
|
254
|
+
"""Plotting for SemanticShiftResult is not yet implemented.
|
|
255
|
+
|
|
256
|
+
For a forward-looking trajectory of cosine distances, use
|
|
257
|
+
:func:`pycorpdiff.semantic_trajectory` and pass the resulting
|
|
258
|
+
DataFrame to :func:`pycorpdiff.viz.semantic_forecast_plot`.
|
|
259
|
+
"""
|
|
260
|
+
raise NotImplementedError(
|
|
261
|
+
"SemanticShiftResult.plot() is not yet implemented; "
|
|
262
|
+
"use .table or pcd.viz.semantic_forecast_plot() instead"
|
|
263
|
+
)
|
|
255
264
|
|
|
256
265
|
def neighbors_before(
|
|
257
266
|
self, target: str | None = None, n: int = 10
|
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
"""Smoke tests for the
|
|
1
|
+
"""Smoke tests for the public surface.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
NotImplementedError and are intentionally not exercised here — Phase 1
|
|
7
|
-
will replace those tests with real ones.
|
|
3
|
+
Quick exercises of imports, the :class:`Corpus` constructor, slicing,
|
|
4
|
+
the default regex tokenizer, and the CSV/parquet readers. The deeper
|
|
5
|
+
analytical surfaces have their own dedicated test modules.
|
|
8
6
|
"""
|
|
9
7
|
|
|
10
8
|
from __future__ import annotations
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|