pycorpdiff 0.1.0a0__tar.gz → 0.1.0a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/CHANGELOG.md +1 -1
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/CITATION.cff +7 -14
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/PKG-INFO +44 -32
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/README.md +42 -30
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/pyproject.toml +11 -13
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/__init__.py +9 -10
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/results.py +10 -1
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_smoke.py +4 -6
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/.gitignore +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/LICENSE +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/compare.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/bayes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_quanteda.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.
|
|
7
|
+
## [0.1.0a1] — initial release
|
|
8
8
|
|
|
9
9
|
The initial public release of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
@@ -1,11 +1,10 @@
|
|
|
1
1
|
cff-version: 1.2.0
|
|
2
2
|
message: >
|
|
3
|
-
If you use pycorpdiff in academic work, please cite
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
preparation; the draft will live in this repository as paper/paper.tex.
|
|
3
|
+
If you use pycorpdiff in academic work, please cite this software
|
|
4
|
+
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
|
+
this file.
|
|
7
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
8
|
-
version: 0.1.
|
|
7
|
+
version: 0.1.0a1
|
|
9
8
|
date-released: 2026-05-22
|
|
10
9
|
authors:
|
|
11
10
|
- family-names: Turner
|
|
@@ -34,16 +33,10 @@ abstract: >
|
|
|
34
33
|
computational social science, and discourse analysis research,
|
|
35
34
|
emphasising interpretability, explainability, statistical rigour,
|
|
36
35
|
and reproducibility.
|
|
37
|
-
preferred-citation:
|
|
38
|
-
type: article
|
|
39
|
-
authors:
|
|
40
|
-
- family-names: Turner
|
|
41
|
-
given-names: Jason
|
|
42
|
-
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
43
|
-
journal: "Journal of Statistical Software"
|
|
44
|
-
year: 2026
|
|
45
|
-
status: in-preparation
|
|
46
36
|
identifiers:
|
|
47
37
|
- type: url
|
|
48
38
|
value: "https://github.com/jturner-uofl/pycorpdiff"
|
|
49
39
|
description: Project repository
|
|
40
|
+
- type: url
|
|
41
|
+
value: "https://pypi.org/project/pycorpdiff/"
|
|
42
|
+
description: PyPI release
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a0
|
|
3
|
+
Version: 0.1.0a1
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -30,7 +30,7 @@ License: MIT License
|
|
|
30
30
|
SOFTWARE.
|
|
31
31
|
License-File: LICENSE
|
|
32
32
|
Keywords: collocation,comparative corpus analysis,computational social science,corpus linguistics,diachronic nlp,digital humanities,discourse analysis,keyness,semantic change,temporal text analysis
|
|
33
|
-
Classifier: Development Status ::
|
|
33
|
+
Classifier: Development Status :: 3 - Alpha
|
|
34
34
|
Classifier: Intended Audience :: Science/Research
|
|
35
35
|
Classifier: License :: OSI Approved :: MIT License
|
|
36
36
|
Classifier: Programming Language :: Python :: 3
|
|
@@ -95,15 +95,10 @@ Description-Content-Type: text/markdown
|
|
|
95
95
|
|
|
96
96
|
# pycorpdiff
|
|
97
97
|
|
|
98
|
-
<!--
|
|
99
|
-
TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
|
|
100
|
-
|
|
101
98
|
[](https://pypi.org/project/pycorpdiff/)
|
|
102
99
|
[](https://pypi.org/project/pycorpdiff/)
|
|
103
100
|
[](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
|
|
104
|
-
[](https://doi.org/10.5281/zenodo.<RECORD>)
|
|
105
101
|
[](https://opensource.org/licenses/MIT)
|
|
106
|
-
-->
|
|
107
102
|
|
|
108
103
|
**Comparative corpus analysis for modern Python workflows.**
|
|
109
104
|
|
|
@@ -135,8 +130,8 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
135
130
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
136
131
|
via extras.
|
|
137
132
|
|
|
138
|
-
> **Status:
|
|
139
|
-
>
|
|
133
|
+
> **Status: alpha (0.1.0a1).** Public API is stable for the features
|
|
134
|
+
> described below; on PyPI as `pip install pycorpdiff`.
|
|
140
135
|
|
|
141
136
|
## The three-layer architecture
|
|
142
137
|
|
|
@@ -151,33 +146,45 @@ via extras.
|
|
|
151
146
|
```python
|
|
152
147
|
import pycorpdiff as pcd
|
|
153
148
|
|
|
154
|
-
|
|
149
|
+
# Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
|
|
150
|
+
corpus = pcd.load_hansard_sample()
|
|
151
|
+
immigration = corpus.slice(topic="immigration")
|
|
152
|
+
human = immigration.slice(frame="humanising")
|
|
153
|
+
criminal = immigration.slice(frame="criminalising")
|
|
155
154
|
|
|
156
155
|
# Compare — three verbs
|
|
157
|
-
k = pcd.compare(
|
|
158
|
-
c = pcd.compare(
|
|
159
|
-
s = pcd.compare(
|
|
156
|
+
k = pcd.compare(human, criminal).keyness()
|
|
157
|
+
c = pcd.compare(human, criminal).collocation_shift("immigrant")
|
|
158
|
+
# s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
|
|
159
|
+
# ↑ requires `pip install "pycorpdiff[semantic]"`
|
|
160
160
|
|
|
161
161
|
# Track over time
|
|
162
|
-
tr = pcd.track(
|
|
163
|
-
tr.changepoints()
|
|
164
|
-
tr.changepoints_online(hazard=1/24)
|
|
165
|
-
tr.interrupted_time_series(event_date="2016
|
|
166
|
-
tr.causal_impact(event_date="2016
|
|
167
|
-
tr.forecast(horizon=4)
|
|
162
|
+
tr = pcd.track(immigration, "criminal").over_time(freq="Y")
|
|
163
|
+
tr.changepoints() # offline PELT
|
|
164
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
165
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
166
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
167
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
168
168
|
|
|
169
169
|
# Before / after a known event
|
|
170
|
-
pcd.compare.before_after(
|
|
170
|
+
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
171
171
|
|
|
172
|
-
# N-way (≥ 2 corpora)
|
|
173
|
-
|
|
172
|
+
# N-way (≥ 2 corpora) — one keyness across all four parties
|
|
173
|
+
parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
|
|
174
|
+
nhs = corpus.slice(topic="nhs")
|
|
175
|
+
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
174
176
|
|
|
175
177
|
# The discourse as a graph
|
|
176
|
-
pcd.cooccurrence_network(
|
|
178
|
+
pcd.cooccurrence_network(immigration, top_n=30).plot()
|
|
177
179
|
|
|
178
180
|
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
179
181
|
```
|
|
180
182
|
|
|
183
|
+
The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
|
|
184
|
+
download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
185
|
+
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
|
|
186
|
+
to use your own corpus.
|
|
187
|
+
|
|
181
188
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
182
189
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
183
190
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -185,23 +192,28 @@ surface.
|
|
|
185
192
|
|
|
186
193
|
## Installation
|
|
187
194
|
|
|
188
|
-
|
|
195
|
+
```bash
|
|
196
|
+
pip install pycorpdiff # lexical-comparative core
|
|
197
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
198
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
199
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
200
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
|
|
201
|
+
pip install "pycorpdiff[all]" # everything
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
The base install keeps a small dependency footprint (`numpy`, `pandas`,
|
|
205
|
+
`scipy`, `pyarrow`); optional extras land per analytical layer so you
|
|
206
|
+
only pay for what you use.
|
|
189
207
|
|
|
190
|
-
|
|
208
|
+
To work from source:
|
|
191
209
|
|
|
192
210
|
```bash
|
|
193
211
|
git clone https://github.com/jturner-uofl/pycorpdiff
|
|
194
212
|
cd pycorpdiff
|
|
195
213
|
pip install -e ".[dev]"
|
|
196
|
-
pytest -q
|
|
214
|
+
pytest -q
|
|
197
215
|
```
|
|
198
216
|
|
|
199
|
-
Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
|
|
200
|
-
(sentence-transformers + scikit-learn), `[temporal]` (ruptures +
|
|
201
|
-
statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
|
|
202
|
-
`[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
|
|
203
|
-
or `[all]`.
|
|
204
|
-
|
|
205
217
|
## Cross-validation receipts
|
|
206
218
|
|
|
207
219
|
The math agrees with the standard tools — by automated test:
|
|
@@ -1,14 +1,9 @@
|
|
|
1
1
|
# pycorpdiff
|
|
2
2
|
|
|
3
|
-
<!--
|
|
4
|
-
TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
|
|
5
|
-
|
|
6
3
|
[](https://pypi.org/project/pycorpdiff/)
|
|
7
4
|
[](https://pypi.org/project/pycorpdiff/)
|
|
8
5
|
[](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
|
|
9
|
-
[](https://doi.org/10.5281/zenodo.<RECORD>)
|
|
10
6
|
[](https://opensource.org/licenses/MIT)
|
|
11
|
-
-->
|
|
12
7
|
|
|
13
8
|
**Comparative corpus analysis for modern Python workflows.**
|
|
14
9
|
|
|
@@ -40,8 +35,8 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
40
35
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
41
36
|
via extras.
|
|
42
37
|
|
|
43
|
-
> **Status:
|
|
44
|
-
>
|
|
38
|
+
> **Status: alpha (0.1.0a1).** Public API is stable for the features
|
|
39
|
+
> described below; on PyPI as `pip install pycorpdiff`.
|
|
45
40
|
|
|
46
41
|
## The three-layer architecture
|
|
47
42
|
|
|
@@ -56,33 +51,45 @@ via extras.
|
|
|
56
51
|
```python
|
|
57
52
|
import pycorpdiff as pcd
|
|
58
53
|
|
|
59
|
-
|
|
54
|
+
# Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
|
|
55
|
+
corpus = pcd.load_hansard_sample()
|
|
56
|
+
immigration = corpus.slice(topic="immigration")
|
|
57
|
+
human = immigration.slice(frame="humanising")
|
|
58
|
+
criminal = immigration.slice(frame="criminalising")
|
|
60
59
|
|
|
61
60
|
# Compare — three verbs
|
|
62
|
-
k = pcd.compare(
|
|
63
|
-
c = pcd.compare(
|
|
64
|
-
s = pcd.compare(
|
|
61
|
+
k = pcd.compare(human, criminal).keyness()
|
|
62
|
+
c = pcd.compare(human, criminal).collocation_shift("immigrant")
|
|
63
|
+
# s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
|
|
64
|
+
# ↑ requires `pip install "pycorpdiff[semantic]"`
|
|
65
65
|
|
|
66
66
|
# Track over time
|
|
67
|
-
tr = pcd.track(
|
|
68
|
-
tr.changepoints()
|
|
69
|
-
tr.changepoints_online(hazard=1/24)
|
|
70
|
-
tr.interrupted_time_series(event_date="2016
|
|
71
|
-
tr.causal_impact(event_date="2016
|
|
72
|
-
tr.forecast(horizon=4)
|
|
67
|
+
tr = pcd.track(immigration, "criminal").over_time(freq="Y")
|
|
68
|
+
tr.changepoints() # offline PELT
|
|
69
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
70
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
71
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
72
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
73
73
|
|
|
74
74
|
# Before / after a known event
|
|
75
|
-
pcd.compare.before_after(
|
|
75
|
+
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
76
76
|
|
|
77
|
-
# N-way (≥ 2 corpora)
|
|
78
|
-
|
|
77
|
+
# N-way (≥ 2 corpora) — one keyness across all four parties
|
|
78
|
+
parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
|
|
79
|
+
nhs = corpus.slice(topic="nhs")
|
|
80
|
+
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
79
81
|
|
|
80
82
|
# The discourse as a graph
|
|
81
|
-
pcd.cooccurrence_network(
|
|
83
|
+
pcd.cooccurrence_network(immigration, top_n=30).plot()
|
|
82
84
|
|
|
83
85
|
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
84
86
|
```
|
|
85
87
|
|
|
88
|
+
The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
|
|
89
|
+
download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
90
|
+
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
|
|
91
|
+
to use your own corpus.
|
|
92
|
+
|
|
86
93
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
87
94
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
88
95
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -90,23 +97,28 @@ surface.
|
|
|
90
97
|
|
|
91
98
|
## Installation
|
|
92
99
|
|
|
93
|
-
|
|
100
|
+
```bash
|
|
101
|
+
pip install pycorpdiff # lexical-comparative core
|
|
102
|
+
pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
|
|
103
|
+
pip install "pycorpdiff[semantic]" # + sentence-transformers
|
|
104
|
+
pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
|
|
105
|
+
pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
|
|
106
|
+
pip install "pycorpdiff[all]" # everything
|
|
107
|
+
```
|
|
108
|
+
|
|
109
|
+
The base install keeps a small dependency footprint (`numpy`, `pandas`,
|
|
110
|
+
`scipy`, `pyarrow`); optional extras land per analytical layer so you
|
|
111
|
+
only pay for what you use.
|
|
94
112
|
|
|
95
|
-
|
|
113
|
+
To work from source:
|
|
96
114
|
|
|
97
115
|
```bash
|
|
98
116
|
git clone https://github.com/jturner-uofl/pycorpdiff
|
|
99
117
|
cd pycorpdiff
|
|
100
118
|
pip install -e ".[dev]"
|
|
101
|
-
pytest -q
|
|
119
|
+
pytest -q
|
|
102
120
|
```
|
|
103
121
|
|
|
104
|
-
Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
|
|
105
|
-
(sentence-transformers + scikit-learn), `[temporal]` (ruptures +
|
|
106
|
-
statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
|
|
107
|
-
`[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
|
|
108
|
-
or `[all]`.
|
|
109
|
-
|
|
110
122
|
## Cross-validation receipts
|
|
111
123
|
|
|
112
124
|
The math agrees with the standard tools — by automated test:
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a0"
|
|
7
|
+
version = "0.1.0a1"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -23,7 +23,7 @@ keywords = [
|
|
|
23
23
|
"temporal text analysis",
|
|
24
24
|
]
|
|
25
25
|
classifiers = [
|
|
26
|
-
"Development Status ::
|
|
26
|
+
"Development Status :: 3 - Alpha",
|
|
27
27
|
"Intended Audience :: Science/Research",
|
|
28
28
|
"License :: OSI Approved :: MIT License",
|
|
29
29
|
"Programming Language :: Python :: 3",
|
|
@@ -36,9 +36,9 @@ classifiers = [
|
|
|
36
36
|
]
|
|
37
37
|
dependencies = [
|
|
38
38
|
"numpy>=1.24",
|
|
39
|
-
# Capped at <3
|
|
40
|
-
#
|
|
41
|
-
#
|
|
39
|
+
# Capped at <3: pandas 3.x raises Pandas4Warning under strict
|
|
40
|
+
# warning filters via third-party DataFrame copies. Lift when the
|
|
41
|
+
# wider PyData stack catches up.
|
|
42
42
|
"pandas>=2.0,<3",
|
|
43
43
|
"scipy>=1.11",
|
|
44
44
|
"pyarrow>=14",
|
|
@@ -47,13 +47,12 @@ dependencies = [
|
|
|
47
47
|
[project.optional-dependencies]
|
|
48
48
|
# Visualisation: altair-first, matplotlib retained for paper-grade figures.
|
|
49
49
|
viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
|
|
50
|
-
# Embedding-based semantic shift
|
|
51
|
-
#
|
|
50
|
+
# Embedding-based semantic shift. sentence-transformers pulls torch
|
|
51
|
+
# transitively, which is why this is opt-in rather than a base dep.
|
|
52
52
|
semantic = ["sentence-transformers>=2.2", "scikit-learn>=1.3"]
|
|
53
53
|
# Changepoint detection + interrupted time series.
|
|
54
54
|
temporal = ["ruptures>=1.1", "statsmodels>=0.14"]
|
|
55
|
-
# Optional columnar backend. polars.from_pandas() needs pyarrow at runtime
|
|
56
|
-
# (mirrors pysofra's pinning).
|
|
55
|
+
# Optional columnar backend. polars.from_pandas() needs pyarrow at runtime.
|
|
57
56
|
polars = ["polars>=1.0", "pyarrow>=15"]
|
|
58
57
|
# Out-of-core querying for large corpora.
|
|
59
58
|
duckdb = ["duckdb>=0.10"]
|
|
@@ -92,9 +91,9 @@ dev = [
|
|
|
92
91
|
"ruff>=0.4",
|
|
93
92
|
"mypy>=1.8",
|
|
94
93
|
"pre-commit>=3.6",
|
|
95
|
-
# pandas-stubs sharpens mypy strict typing for pandas surfaces
|
|
96
|
-
#
|
|
97
|
-
#
|
|
94
|
+
# pandas-stubs sharpens mypy strict typing for pandas surfaces.
|
|
95
|
+
# Without it, ignore_missing_imports would mask real typing
|
|
96
|
+
# regressions in pandas-mediated code paths.
|
|
98
97
|
"pandas-stubs>=2.2",
|
|
99
98
|
]
|
|
100
99
|
[project.urls]
|
|
@@ -119,7 +118,6 @@ include = [
|
|
|
119
118
|
exclude = [
|
|
120
119
|
"docs",
|
|
121
120
|
"examples",
|
|
122
|
-
"site",
|
|
123
121
|
".github",
|
|
124
122
|
"uv.lock",
|
|
125
123
|
]
|
|
@@ -1,26 +1,25 @@
|
|
|
1
1
|
"""pycorpdiff — comparative corpus analysis for modern Python workflows.
|
|
2
2
|
|
|
3
3
|
The package exposes three public verbs (:func:`compare`, :func:`track`,
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
:class:`
|
|
7
|
-
:class:`
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
of the roadmap lands.
|
|
4
|
+
:func:`compare.before_after`) and a family of frozen-dataclass
|
|
5
|
+
result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
|
|
6
|
+
:class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
|
|
7
|
+
:class:`NetworkResult`, :class:`ForecastResult`,
|
|
8
|
+
:class:`CausalImpactResult`, :class:`BocpdResult`,
|
|
9
|
+
:class:`ConcordanceResult`), each implementing the same
|
|
10
|
+
``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
|
|
12
11
|
|
|
13
12
|
Example
|
|
14
13
|
-------
|
|
15
14
|
|
|
16
15
|
>>> import pycorpdiff as pcd
|
|
17
16
|
>>> pcd.__version__
|
|
18
|
-
'0.1.
|
|
17
|
+
'0.1.0a1'
|
|
19
18
|
"""
|
|
20
19
|
|
|
21
20
|
from __future__ import annotations
|
|
22
21
|
|
|
23
|
-
__version__ = "0.1.
|
|
22
|
+
__version__ = "0.1.0a1"
|
|
24
23
|
|
|
25
24
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
26
25
|
from .compare import Comparison, compare
|
|
@@ -251,7 +251,16 @@ class SemanticShiftResult:
|
|
|
251
251
|
return _table_to_json(self.table, path, **kw)
|
|
252
252
|
|
|
253
253
|
def plot(self, **kw: Any) -> alt.Chart:
|
|
254
|
-
|
|
254
|
+
"""Plotting for SemanticShiftResult is not yet implemented.
|
|
255
|
+
|
|
256
|
+
For a forward-looking trajectory of cosine distances, use
|
|
257
|
+
:func:`pycorpdiff.semantic_trajectory` and pass the resulting
|
|
258
|
+
DataFrame to :func:`pycorpdiff.viz.semantic_forecast_plot`.
|
|
259
|
+
"""
|
|
260
|
+
raise NotImplementedError(
|
|
261
|
+
"SemanticShiftResult.plot() is not yet implemented; "
|
|
262
|
+
"use .table or pcd.viz.semantic_forecast_plot() instead"
|
|
263
|
+
)
|
|
255
264
|
|
|
256
265
|
def neighbors_before(
|
|
257
266
|
self, target: str | None = None, n: int = 10
|
|
@@ -1,10 +1,8 @@
|
|
|
1
|
-
"""Smoke tests for the
|
|
1
|
+
"""Smoke tests for the public surface.
|
|
2
2
|
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
|
|
6
|
-
NotImplementedError and are intentionally not exercised here — Phase 1
|
|
7
|
-
will replace those tests with real ones.
|
|
3
|
+
Quick exercises of imports, the :class:`Corpus` constructor, slicing,
|
|
4
|
+
the default regex tokenizer, and the CSV/parquet readers. The deeper
|
|
5
|
+
analytical surfaces have their own dedicated test modules.
|
|
8
6
|
"""
|
|
9
7
|
|
|
10
8
|
from __future__ import annotations
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|