pycorpdiff 0.1.0a1__tar.gz → 0.1.0a3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/CHANGELOG.md +1 -1
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/CITATION.cff +1 -1
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/PKG-INFO +47 -33
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/README.md +44 -30
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/pyproject.toml +3 -3
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/__init__.py +2 -2
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/.gitignore +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/LICENSE +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/compare.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/bayes.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/results.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/shift.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_quanteda.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_rayson.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.0a1] — initial release
|
|
7
|
+
## [0.1.0a3] — initial release
|
|
8
8
|
|
|
9
9
|
The initial public release of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
@@ -4,7 +4,7 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.0a1
|
|
7
|
+
version: 0.1.0a3
|
|
8
8
|
date-released: 2026-05-22
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a1
|
|
3
|
+
Version: 0.1.0a3
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -53,7 +53,7 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
|
53
53
|
Requires-Dist: networkx>=3.1; extra == 'all'
|
|
54
54
|
Requires-Dist: polars>=1.0; extra == 'all'
|
|
55
55
|
Requires-Dist: pyarrow>=15; extra == 'all'
|
|
56
|
-
Requires-Dist: pysofra>=0.1.
|
|
56
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
|
|
57
57
|
Requires-Dist: ruptures>=1.1; extra == 'all'
|
|
58
58
|
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
59
59
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
@@ -76,7 +76,7 @@ Provides-Extra: nlp
|
|
|
76
76
|
Requires-Dist: spacy>=3.7; extra == 'nlp'
|
|
77
77
|
Provides-Extra: notebooks
|
|
78
78
|
Requires-Dist: jupyter>=1.0; extra == 'notebooks'
|
|
79
|
-
Requires-Dist: pysofra>=0.1.
|
|
79
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
|
|
80
80
|
Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
|
|
81
81
|
Provides-Extra: polars
|
|
82
82
|
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
@@ -130,7 +130,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
130
130
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
131
131
|
via extras.
|
|
132
132
|
|
|
133
|
-
> **Status: alpha (0.1.0a1).** Public API is stable for the features
|
|
133
|
+
> **Status: alpha (0.1.0a3).** Public API is stable for the features
|
|
134
134
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
135
135
|
|
|
136
136
|
## The three-layer architecture
|
|
@@ -143,48 +143,62 @@ via extras.
|
|
|
143
143
|
|
|
144
144
|
## Quick start
|
|
145
145
|
|
|
146
|
+
```bash
|
|
147
|
+
pip install "pycorpdiff[viz]"
|
|
148
|
+
```
|
|
149
|
+
|
|
146
150
|
```python
|
|
147
151
|
import pycorpdiff as pcd
|
|
148
152
|
|
|
149
|
-
# Bundled
|
|
153
|
+
# Bundled UK-Hansard sample — runs offline, no data download.
|
|
150
154
|
corpus = pcd.load_hansard_sample()
|
|
151
155
|
immigration = corpus.slice(topic="immigration")
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
#
|
|
160
|
-
|
|
161
|
-
#
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
156
|
+
|
|
157
|
+
# Which words separate the humanising and criminalising frames?
|
|
158
|
+
keyness = pcd.compare(
|
|
159
|
+
immigration.slice(frame="humanising"),
|
|
160
|
+
immigration.slice(frame="criminalising"),
|
|
161
|
+
).keyness(min_count=3)
|
|
162
|
+
|
|
163
|
+
keyness.plot() # volcano plot — picture the result
|
|
164
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
165
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
166
|
+
```
|
|
167
|
+
|
|
168
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
169
|
+
compare two slices, plot the result. Every other analytical method —
|
|
170
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
171
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
172
|
+
N-way keyness — follows the same shape. See
|
|
173
|
+
[the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
|
|
174
|
+
full feature tour, or the cheat sheet below for one-line API previews.
|
|
175
|
+
|
|
176
|
+
### Cheat sheet — every analytical surface in one block
|
|
177
|
+
|
|
178
|
+
```python
|
|
179
|
+
# Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
|
|
180
|
+
pcd.compare(a, b).keyness()
|
|
181
|
+
pcd.compare(a, b).collocation_shift("migrant")
|
|
182
|
+
pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
183
|
+
|
|
184
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
185
|
+
tr = pcd.track(corpus, "migrant").over_time(freq="Y")
|
|
186
|
+
tr.changepoints() # offline PELT
|
|
187
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
188
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
189
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
190
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
168
191
|
|
|
169
192
|
# Before / after a known event
|
|
170
193
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
171
194
|
|
|
172
|
-
# N-way (≥ 2 corpora)
|
|
173
|
-
|
|
174
|
-
nhs = corpus.slice(topic="nhs")
|
|
175
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
195
|
+
# N-way (≥ 2 corpora)
|
|
196
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
176
197
|
|
|
177
198
|
# The discourse as a graph
|
|
178
|
-
pcd.cooccurrence_network(
|
|
179
|
-
|
|
180
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
199
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
181
200
|
```
|
|
182
201
|
|
|
183
|
-
The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
|
|
184
|
-
download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
185
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
|
|
186
|
-
to use your own corpus.
|
|
187
|
-
|
|
188
202
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
189
203
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
190
204
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
35
35
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
36
36
|
via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.0a1).** Public API is stable for the features
|
|
38
|
+
> **Status: alpha (0.1.0a3).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -48,48 +48,62 @@ via extras.
|
|
|
48
48
|
|
|
49
49
|
## Quick start
|
|
50
50
|
|
|
51
|
+
```bash
|
|
52
|
+
pip install "pycorpdiff[viz]"
|
|
53
|
+
```
|
|
54
|
+
|
|
51
55
|
```python
|
|
52
56
|
import pycorpdiff as pcd
|
|
53
57
|
|
|
54
|
-
# Bundled
|
|
58
|
+
# Bundled UK-Hansard sample — runs offline, no data download.
|
|
55
59
|
corpus = pcd.load_hansard_sample()
|
|
56
60
|
immigration = corpus.slice(topic="immigration")
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
#
|
|
65
|
-
|
|
66
|
-
#
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
61
|
+
|
|
62
|
+
# Which words separate the humanising and criminalising frames?
|
|
63
|
+
keyness = pcd.compare(
|
|
64
|
+
immigration.slice(frame="humanising"),
|
|
65
|
+
immigration.slice(frame="criminalising"),
|
|
66
|
+
).keyness(min_count=3)
|
|
67
|
+
|
|
68
|
+
keyness.plot() # volcano plot — picture the result
|
|
69
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
70
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
74
|
+
compare two slices, plot the result. Every other analytical method —
|
|
75
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
76
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
77
|
+
N-way keyness — follows the same shape. See
|
|
78
|
+
[the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
|
|
79
|
+
full feature tour, or the cheat sheet below for one-line API previews.
|
|
80
|
+
|
|
81
|
+
### Cheat sheet — every analytical surface in one block
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Compare verbs (returns Result objects with .plot / .to_df / .explain / .summary)
|
|
85
|
+
pcd.compare(a, b).keyness()
|
|
86
|
+
pcd.compare(a, b).collocation_shift("migrant")
|
|
87
|
+
pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
88
|
+
|
|
89
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
90
|
+
tr = pcd.track(corpus, "migrant").over_time(freq="Y")
|
|
91
|
+
tr.changepoints() # offline PELT
|
|
92
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
93
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
94
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
95
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
73
96
|
|
|
74
97
|
# Before / after a known event
|
|
75
98
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
76
99
|
|
|
77
|
-
# N-way (≥ 2 corpora)
|
|
78
|
-
|
|
79
|
-
nhs = corpus.slice(topic="nhs")
|
|
80
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
100
|
+
# N-way (≥ 2 corpora)
|
|
101
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
81
102
|
|
|
82
103
|
# The discourse as a graph
|
|
83
|
-
pcd.cooccurrence_network(
|
|
84
|
-
|
|
85
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
104
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
86
105
|
```
|
|
87
106
|
|
|
88
|
-
The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
|
|
89
|
-
download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
|
|
90
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
|
|
91
|
-
to use your own corpus.
|
|
92
|
-
|
|
93
107
|
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
94
108
|
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
95
109
|
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a1"
|
|
7
|
+
version = "0.1.0a3"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -66,7 +66,7 @@ huggingface = ["datasets>=2.14"]
|
|
|
66
66
|
# rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
|
|
67
67
|
# static SVG/PNG export of altair charts, `pysofra` renders the showcase's
|
|
68
68
|
# result tables in JAMA-style typography.
|
|
69
|
-
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.
|
|
69
|
+
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
|
|
70
70
|
# Meta-extra so `pycorpdiff[all]` exercises every optional code path.
|
|
71
71
|
all = [
|
|
72
72
|
"altair>=5",
|
|
@@ -82,7 +82,7 @@ all = [
|
|
|
82
82
|
"duckdb>=0.10",
|
|
83
83
|
"spacy>=3.7",
|
|
84
84
|
"vl-convert-python>=1.5",
|
|
85
|
-
"pysofra>=0.1.
|
|
85
|
+
"pysofra>=0.1.0a3",
|
|
86
86
|
]
|
|
87
87
|
dev = [
|
|
88
88
|
"pytest>=8",
|
|
@@ -14,12 +14,12 @@ Example
|
|
|
14
14
|
|
|
15
15
|
>>> import pycorpdiff as pcd
|
|
16
16
|
>>> pcd.__version__
|
|
17
|
-
'0.1.0a1'
|
|
17
|
+
'0.1.0a3'
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
__version__ = "0.1.0a1"
|
|
22
|
+
__version__ = "0.1.0a3"
|
|
23
23
|
|
|
24
24
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
25
25
|
from .compare import Comparison, compare
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a1 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|