pycorpdiff 0.1.0a2__tar.gz → 0.1.0a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/CHANGELOG.md +1 -1
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/CITATION.cff +1 -1
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/PKG-INFO +44 -35
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/README.md +41 -32
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/pyproject.toml +3 -3
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/__init__.py +2 -2
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/.gitignore +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/LICENSE +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/compare.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/bayes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/results.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_quanteda.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.0a2] — initial release
|
|
7
|
+
## [0.1.0a3] — initial release
|
|
8
8
|
|
|
9
9
|
The initial public release of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
@@ -4,7 +4,7 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.0a2
|
|
7
|
+
version: 0.1.0a3
|
|
8
8
|
date-released: 2026-05-22
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a2
|
|
3
|
+
Version: 0.1.0a3
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -53,7 +53,7 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
|
53
53
|
Requires-Dist: networkx>=3.1; extra == 'all'
|
|
54
54
|
Requires-Dist: polars>=1.0; extra == 'all'
|
|
55
55
|
Requires-Dist: pyarrow>=15; extra == 'all'
|
|
56
|
-
Requires-Dist: pysofra>=0.1.
|
|
56
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
|
|
57
57
|
Requires-Dist: ruptures>=1.1; extra == 'all'
|
|
58
58
|
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
59
59
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
@@ -76,7 +76,7 @@ Provides-Extra: nlp
|
|
|
76
76
|
Requires-Dist: spacy>=3.7; extra == 'nlp'
|
|
77
77
|
Provides-Extra: notebooks
|
|
78
78
|
Requires-Dist: jupyter>=1.0; extra == 'notebooks'
|
|
79
|
-
Requires-Dist: pysofra>=0.1.
|
|
79
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
|
|
80
80
|
Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
|
|
81
81
|
Provides-Extra: polars
|
|
82
82
|
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
@@ -130,7 +130,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
130
130
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
131
131
|
via extras.
|
|
132
132
|
|
|
133
|
-
> **Status: alpha (0.1.0a2).** Public API is stable for the features
|
|
133
|
+
> **Status: alpha (0.1.0a3).** Public API is stable for the features
|
|
134
134
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
135
135
|
|
|
136
136
|
## The three-layer architecture
|
|
@@ -144,52 +144,61 @@ via extras.
|
|
|
144
144
|
## Quick start
|
|
145
145
|
|
|
146
146
|
```bash
|
|
147
|
-
pip install "pycorpdiff[viz,temporal]"
|
|
147
|
+
pip install "pycorpdiff[viz]"
|
|
148
148
|
```
|
|
149
149
|
|
|
150
150
|
```python
|
|
151
151
|
import pycorpdiff as pcd
|
|
152
152
|
|
|
153
|
-
# Bundled
|
|
153
|
+
# Bundled UK-Hansard sample — runs offline, no data download.
|
|
154
154
|
corpus = pcd.load_hansard_sample()
|
|
155
155
|
immigration = corpus.slice(topic="immigration")
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
#
|
|
164
|
-
|
|
165
|
-
#
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
156
|
+
|
|
157
|
+
# Which words separate the humanising and criminalising frames?
|
|
158
|
+
keyness = pcd.compare(
|
|
159
|
+
immigration.slice(frame="humanising"),
|
|
160
|
+
immigration.slice(frame="criminalising"),
|
|
161
|
+
).keyness(min_count=3)
|
|
162
|
+
|
|
163
|
+
keyness.plot() # volcano plot — picture the result
|
|
164
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
165
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
169
|
+
compare two slices, plot the result. Every other analytical method —
|
|
170
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
171
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
172
|
+
N-way keyness — follows the same shape. See
|
|
173
|
+
[the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
|
|
174
|
+
full feature tour, or the cheat sheet below for one-line API previews.
|
|
175
|
+
|
|
176
|
+
### Cheat sheet — every analytical surface in one block
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
# Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
|
|
180
|
+
pcd.compare(a, b).keyness()
|
|
181
|
+
pcd.compare(a, b).collocation_shift("migrant")
|
|
182
|
+
pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
183
|
+
|
|
184
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
185
|
+
tr = pcd.track(corpus, "migrant").over_time(freq="Y")
|
|
186
|
+
tr.changepoints() # offline PELT
|
|
187
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
188
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
189
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
190
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
172
191
|
|
|
173
192
|
# Before / after a known event
|
|
174
193
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
175
194
|
|
|
176
|
-
# N-way (≥ 2 corpora)
|
|
177
|
-
|
|
178
|
-
nhs = corpus.slice(topic="nhs")
|
|
179
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
195
|
+
# N-way (≥ 2 corpora)
|
|
196
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
180
197
|
|
|
181
198
|
# The discourse as a graph
|
|
182
|
-
pcd.cooccurrence_network(
|
|
183
|
-
|
|
184
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
199
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
185
200
|
```
|
|
186
201
|
|
|
187
|
-
Every line of the snippet above is verified end-to-end against
|
|
188
|
-
`pip install "pycorpdiff[viz,temporal]"` — no data download required.
|
|
189
|
-
Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
190
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
191
|
-
`pcd.from_huggingface(...)` to use your own corpus.
|
|
192
|
-
|
|
193
202
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
194
203
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
195
204
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
35
35
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
36
36
|
via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.0a2).** Public API is stable for the features
|
|
38
|
+
> **Status: alpha (0.1.0a3).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -49,52 +49,61 @@ via extras.
|
|
|
49
49
|
## Quick start
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
|
-
pip install "pycorpdiff[viz,temporal]"
|
|
52
|
+
pip install "pycorpdiff[viz]"
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
```python
|
|
56
56
|
import pycorpdiff as pcd
|
|
57
57
|
|
|
58
|
-
# Bundled
|
|
58
|
+
# Bundled UK-Hansard sample — runs offline, no data download.
|
|
59
59
|
corpus = pcd.load_hansard_sample()
|
|
60
60
|
immigration = corpus.slice(topic="immigration")
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
#
|
|
69
|
-
|
|
70
|
-
#
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
61
|
+
|
|
62
|
+
# Which words separate the humanising and criminalising frames?
|
|
63
|
+
keyness = pcd.compare(
|
|
64
|
+
immigration.slice(frame="humanising"),
|
|
65
|
+
immigration.slice(frame="criminalising"),
|
|
66
|
+
).keyness(min_count=3)
|
|
67
|
+
|
|
68
|
+
keyness.plot() # volcano plot — picture the result
|
|
69
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
70
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
74
|
+
compare two slices, plot the result. Every other analytical method —
|
|
75
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
76
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
77
|
+
N-way keyness — follows the same shape. See
|
|
78
|
+
[the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
|
|
79
|
+
full feature tour, or the cheat sheet below for one-line API previews.
|
|
80
|
+
|
|
81
|
+
### Cheat sheet — every analytical surface in one block
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
|
|
85
|
+
pcd.compare(a, b).keyness()
|
|
86
|
+
pcd.compare(a, b).collocation_shift("migrant")
|
|
87
|
+
pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
88
|
+
|
|
89
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
90
|
+
tr = pcd.track(corpus, "migrant").over_time(freq="Y")
|
|
91
|
+
tr.changepoints() # offline PELT
|
|
92
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
93
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
94
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
95
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
77
96
|
|
|
78
97
|
# Before / after a known event
|
|
79
98
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
80
99
|
|
|
81
|
-
# N-way (≥ 2 corpora)
|
|
82
|
-
|
|
83
|
-
nhs = corpus.slice(topic="nhs")
|
|
84
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
100
|
+
# N-way (≥ 2 corpora)
|
|
101
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
85
102
|
|
|
86
103
|
# The discourse as a graph
|
|
87
|
-
pcd.cooccurrence_network(
|
|
88
|
-
|
|
89
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
104
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
90
105
|
```
|
|
91
106
|
|
|
92
|
-
Every line of the snippet above is verified end-to-end against
|
|
93
|
-
`pip install "pycorpdiff[viz,temporal]"` — no data download required.
|
|
94
|
-
Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
95
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
96
|
-
`pcd.from_huggingface(...)` to use your own corpus.
|
|
97
|
-
|
|
98
107
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
99
108
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
100
109
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a2"
|
|
7
|
+
version = "0.1.0a3"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -66,7 +66,7 @@ huggingface = ["datasets>=2.14"]
|
|
|
66
66
|
# rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
|
|
67
67
|
# static SVG/PNG export of altair charts, `pysofra` renders the showcase's
|
|
68
68
|
# result tables in JAMA-style typography.
|
|
69
|
-
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.
|
|
69
|
+
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
|
|
70
70
|
# Meta-extra so `pycorpdiff[all]` exercises every optional code path.
|
|
71
71
|
all = [
|
|
72
72
|
"altair>=5",
|
|
@@ -82,7 +82,7 @@ all = [
|
|
|
82
82
|
"duckdb>=0.10",
|
|
83
83
|
"spacy>=3.7",
|
|
84
84
|
"vl-convert-python>=1.5",
|
|
85
|
-
"pysofra>=0.1.
|
|
85
|
+
"pysofra>=0.1.0a3",
|
|
86
86
|
]
|
|
87
87
|
dev = [
|
|
88
88
|
"pytest>=8",
|
|
@@ -14,12 +14,12 @@ Example
|
|
|
14
14
|
|
|
15
15
|
>>> import pycorpdiff as pcd
|
|
16
16
|
>>> pcd.__version__
|
|
17
|
-
'0.1.0a2'
|
|
17
|
+
'0.1.0a3'
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
__version__ = "0.1.0a2"
|
|
22
|
+
__version__ = "0.1.0a3"
|
|
23
23
|
|
|
24
24
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
25
25
|
from .compare import Comparison, compare
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|