pycorpdiff 0.1.0a2__tar.gz → 0.1.0a4__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/.gitignore +4 -4
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/CHANGELOG.md +8 -8
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/CITATION.cff +2 -2
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/PKG-INFO +60 -49
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/README.md +56 -46
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/pyproject.toml +7 -5
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/__init__.py +2 -2
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/compare.py +1 -1
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/_generate_hansard.py +1 -1
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/bayes.py +4 -3
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/__init__.py +1 -1
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_crossval_rayson.py +4 -2
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/LICENSE +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/_backends/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/_backends/pandas.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/_backends/polars.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/collocation/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/collocation/measures.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/collocation/network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/collocation/shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/corpus.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/hansard.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/histwords.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/explain.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/io/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/io/duckdb.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/io/huggingface.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/io/readers.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/chi_squared.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/correction.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/multicorpus.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/keyness/permutation.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/py.typed +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/results.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/semantic/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/semantic/alignment.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/semantic/embed.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/semantic/shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/semantic/trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/stats.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/changepoint.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/its.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/temporal/slicing.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/tokenize.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/collocation.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/keyness.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/scattertext.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/viz/trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/conftest.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/fixtures/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_collocation_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_crossval_histwords.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_crossval_nltk.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_crossval_quanteda.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_crossval_scattertext.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_explain_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_keyness_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_sbert_slow.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_semantic_integration.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_stop_words.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_temporal_stats.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/integration/test_viz.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/property/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/property/test_collocation_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/property/test_keyness_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/property/test_temporal_properties.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/__init__.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_bayes_factor.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_bocpd.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_causal_impact.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_changepoint.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_chi_squared.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_collocation_cooccurrence.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_collocation_measures.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_collocation_shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_comparison_concordance.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_cooccurrence_network.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_corpus_hash.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_corpus_vocab.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_correction.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_datasets_hansard.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_dispersion.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_dispersion_plot.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_doc_term_counts_sparse.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_effect_sizes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_embedders.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_explain.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_forecast.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_forecast_semantic_drift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_from_huggingface.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_hansard_fetcher.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_histwords_loader.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_its.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_keyness_multi.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_loglikelihood.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_ngram_tokenizer.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_permutation_keyness.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_polars_interop.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_procrustes.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_read_duckdb.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_read_txt_line_mode.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_result_exports.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_scattertext_plot.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_semantic_neighbours.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_semantic_shift.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_semantic_trajectory.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_smoke.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_temporal.py +0 -0
- {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/tests/unit/test_wilson_ci.py +0 -0
|
@@ -30,12 +30,12 @@ Thumbs.db
|
|
|
30
30
|
*.swo
|
|
31
31
|
*~
|
|
32
32
|
|
|
33
|
-
# AI workflow artefacts (kept local, never published)
|
|
34
|
-
.claude/
|
|
35
|
-
|
|
36
33
|
# Hypothesis example database (auto-managed)
|
|
37
34
|
.hypothesis/
|
|
38
35
|
|
|
36
|
+
# Local tooling
|
|
37
|
+
.claude/
|
|
38
|
+
|
|
39
39
|
# Jupyter checkpoints
|
|
40
40
|
.ipynb_checkpoints/
|
|
41
41
|
|
|
@@ -56,5 +56,5 @@ examples/*.patched.ipynb
|
|
|
56
56
|
# Stray uv lockfiles created outside the repo root
|
|
57
57
|
**/uv.lock.tmp
|
|
58
58
|
|
|
59
|
-
#
|
|
59
|
+
# Static site build output
|
|
60
60
|
site/
|
|
@@ -4,13 +4,13 @@ All notable changes to `pycorpdiff` are documented in this file. The format
|
|
|
4
4
|
follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
|
|
5
5
|
project adheres to [Semantic Versioning](https://semver.org/).
|
|
6
6
|
|
|
7
|
-
## [0.1.
|
|
7
|
+
## [0.1.0a4] — initial release
|
|
8
8
|
|
|
9
9
|
The initial public release of `pycorpdiff` — comparative corpus analysis
|
|
10
10
|
for modern Python workflows. Three public verbs (`compare`, `track`,
|
|
11
|
-
`compare.before_after`), nine `Result` dataclasses
|
|
12
|
-
|
|
13
|
-
.to_json
|
|
11
|
+
`compare.before_after`), nine `Result` dataclasses each implementing the
|
|
12
|
+
relevant subset of `.to_df / .plot / .explain / .summary / .to_html /
|
|
13
|
+
.to_json`, two `typing.Protocol` extension points (`Tokenizer`,
|
|
14
14
|
`Embedder`), and opt-in extras for visualisation, semantic embedding,
|
|
15
15
|
temporal modelling, polars interop, DuckDB ingestion, and 🤗 Datasets.
|
|
16
16
|
|
|
@@ -33,12 +33,12 @@ temporal modelling, polars interop, DuckDB ingestion, and 🤗 Datasets.
|
|
|
33
33
|
|
|
34
34
|
### Cross-validated
|
|
35
35
|
|
|
36
|
-
Numerically agrees with Rayson's LL Wizard
|
|
37
|
-
NLTK's `BigramAssocMeasures` (≤ 1e-12 on PMI / t-score / MI³),
|
|
36
|
+
Numerically agrees with Rayson's LL Wizard on hand-derived reference
|
|
37
|
+
triples, NLTK's `BigramAssocMeasures` (≤ 1e-12 on PMI / t-score / MI³),
|
|
38
38
|
Scattertext on the 2012 US conventions, `quanteda` via `rpy2`, and
|
|
39
39
|
the HistWords COHA replication.
|
|
40
40
|
|
|
41
41
|
### Infrastructure
|
|
42
42
|
|
|
43
|
-
|
|
44
|
-
matrix CI on three Python versions × two operating systems.
|
|
43
|
+
Hundreds of tests, `ruff` + `mypy --strict` clean across the source
|
|
44
|
+
tree, matrix CI on three Python versions × two operating systems.
|
|
@@ -4,8 +4,8 @@ message: >
|
|
|
4
4
|
entry. GitHub renders a "Cite this repository" widget directly from
|
|
5
5
|
this file.
|
|
6
6
|
title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
|
|
7
|
-
version: 0.1.0a2
|
|
8
|
-
date-released: 2026-05-
|
|
7
|
+
version: 0.1.0a4
|
|
8
|
+
date-released: 2026-05-25
|
|
9
9
|
authors:
|
|
10
10
|
- family-names: Turner
|
|
11
11
|
given-names: Jason
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: pycorpdiff
|
|
3
|
-
Version: 0.1.0a2
|
|
3
|
+
Version: 0.1.0a4
|
|
4
4
|
Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
|
|
5
5
|
Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
|
|
6
6
|
Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
|
|
@@ -49,11 +49,12 @@ Provides-Extra: all
|
|
|
49
49
|
Requires-Dist: altair>=5; extra == 'all'
|
|
50
50
|
Requires-Dist: datasets>=2.14; extra == 'all'
|
|
51
51
|
Requires-Dist: duckdb>=0.10; extra == 'all'
|
|
52
|
+
Requires-Dist: jupyter>=1.0; extra == 'all'
|
|
52
53
|
Requires-Dist: matplotlib>=3.8; extra == 'all'
|
|
53
54
|
Requires-Dist: networkx>=3.1; extra == 'all'
|
|
54
55
|
Requires-Dist: polars>=1.0; extra == 'all'
|
|
55
56
|
Requires-Dist: pyarrow>=15; extra == 'all'
|
|
56
|
-
Requires-Dist: pysofra>=0.1.
|
|
57
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
|
|
57
58
|
Requires-Dist: ruptures>=1.1; extra == 'all'
|
|
58
59
|
Requires-Dist: scikit-learn>=1.3; extra == 'all'
|
|
59
60
|
Requires-Dist: sentence-transformers>=2.2; extra == 'all'
|
|
@@ -76,7 +77,7 @@ Provides-Extra: nlp
|
|
|
76
77
|
Requires-Dist: spacy>=3.7; extra == 'nlp'
|
|
77
78
|
Provides-Extra: notebooks
|
|
78
79
|
Requires-Dist: jupyter>=1.0; extra == 'notebooks'
|
|
79
|
-
Requires-Dist: pysofra>=0.1.
|
|
80
|
+
Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
|
|
80
81
|
Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
|
|
81
82
|
Provides-Extra: polars
|
|
82
83
|
Requires-Dist: polars>=1.0; extra == 'polars'
|
|
@@ -110,9 +111,9 @@ platform, and the fragmented Python NLP stack
|
|
|
110
111
|
consolidate keyness, collocations, dispersion, temporal trajectories,
|
|
111
112
|
changepoint detection, interrupted time series, causal-impact analysis,
|
|
112
113
|
forecasting, online changepoint detection, and embedding-based semantic
|
|
113
|
-
shift under a single notebook-native API.
|
|
114
|
-
KWIC evidence: `.explain(term)` returns the
|
|
115
|
-
behind any ranked term.
|
|
114
|
+
shift under a single notebook-native API. Keyness and collocation
|
|
115
|
+
results carry their own KWIC evidence: `.explain(term)` returns the
|
|
116
|
+
source-text concordances behind any ranked term.
|
|
116
117
|
|
|
117
118
|
The package answers the questions corpus linguistics, digital humanities,
|
|
118
119
|
and computational social science routinely have:
|
|
@@ -130,7 +131,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
130
131
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
131
132
|
via extras.
|
|
132
133
|
|
|
133
|
-
> **Status: alpha (0.1.
|
|
134
|
+
> **Status: alpha (0.1.0a4).** Public API is stable for the features
|
|
134
135
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
135
136
|
|
|
136
137
|
## The three-layer architecture
|
|
@@ -139,61 +140,71 @@ via extras.
|
|
|
139
140
|
|---|---|---|
|
|
140
141
|
| **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
|
|
141
142
|
| **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
|
|
142
|
-
| **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each
|
|
143
|
+
| **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each implementing the relevant subset of `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
|
|
143
144
|
|
|
144
145
|
## Quick start
|
|
145
146
|
|
|
146
147
|
```bash
|
|
147
|
-
pip install "pycorpdiff[viz
|
|
148
|
+
pip install "pycorpdiff[viz]"
|
|
148
149
|
```
|
|
149
150
|
|
|
150
151
|
```python
|
|
151
152
|
import pycorpdiff as pcd
|
|
152
153
|
|
|
153
|
-
# Bundled synthetic
|
|
154
|
+
# Bundled synthetic Hansard-style sample — runs offline, no data download.
|
|
154
155
|
corpus = pcd.load_hansard_sample()
|
|
155
156
|
immigration = corpus.slice(topic="immigration")
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
#
|
|
164
|
-
|
|
165
|
-
#
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
157
|
+
|
|
158
|
+
# Which words separate the humanising and criminalising frames?
|
|
159
|
+
keyness = pcd.compare(
|
|
160
|
+
immigration.slice(frame="humanising"),
|
|
161
|
+
immigration.slice(frame="criminalising"),
|
|
162
|
+
).keyness(min_count=3)
|
|
163
|
+
|
|
164
|
+
keyness.plot() # volcano plot — picture the result
|
|
165
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
166
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
167
|
+
```
|
|
168
|
+
|
|
169
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
170
|
+
compare two slices, plot the result. Every other analytical method —
|
|
171
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
172
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
173
|
+
N-way keyness — follows the same shape. See
|
|
174
|
+
[the showcase notebook](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
|
|
175
|
+
for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
176
|
+
|
|
177
|
+
### Cheat sheet — every analytical surface in one block
|
|
178
|
+
|
|
179
|
+
```python
|
|
180
|
+
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
181
|
+
pcd.compare(a, b).keyness()
|
|
182
|
+
pcd.compare(a, b).collocation_shift("immigrant")
|
|
183
|
+
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
184
|
+
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
185
|
+
# use pcd.HashEmbedder() for offline / deterministic-test settings.
|
|
186
|
+
|
|
187
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
188
|
+
tr = pcd.track(corpus, "immigrant").over_time(freq="Y")
|
|
189
|
+
tr.changepoints() # offline PELT
|
|
190
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
191
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
192
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
193
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
172
194
|
|
|
173
195
|
# Before / after a known event
|
|
174
196
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
175
197
|
|
|
176
|
-
# N-way (≥ 2 corpora)
|
|
177
|
-
|
|
178
|
-
nhs = corpus.slice(topic="nhs")
|
|
179
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
198
|
+
# N-way (≥ 2 corpora)
|
|
199
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
180
200
|
|
|
181
201
|
# The discourse as a graph
|
|
182
|
-
pcd.cooccurrence_network(
|
|
183
|
-
|
|
184
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
202
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
185
203
|
```
|
|
186
204
|
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
191
|
-
`pcd.from_huggingface(...)` to use your own corpus.
|
|
192
|
-
|
|
193
|
-
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
194
|
-
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
195
|
-
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
196
|
-
surface.
|
|
205
|
+
See [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
|
|
206
|
+
for a walkthrough on the synthetic Hansard-style corpus exercising
|
|
207
|
+
every analytical surface.
|
|
197
208
|
|
|
198
209
|
## Installation
|
|
199
210
|
|
|
@@ -223,7 +234,7 @@ pytest -q
|
|
|
223
234
|
|
|
224
235
|
The math agrees with the standard tools — by automated test:
|
|
225
236
|
|
|
226
|
-
- **Rayson's LL Wizard** —
|
|
237
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference triples
|
|
227
238
|
- **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
|
|
228
239
|
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
|
|
229
240
|
- **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
|
|
@@ -237,11 +248,11 @@ repository" widget directly from it.
|
|
|
237
248
|
|
|
238
249
|
## License
|
|
239
250
|
|
|
240
|
-
MIT — see [LICENSE](LICENSE).
|
|
251
|
+
MIT — see [LICENSE](https://github.com/jturner-uofl/pycorpdiff/blob/main/LICENSE).
|
|
241
252
|
|
|
242
253
|
## Further reading
|
|
243
254
|
|
|
244
|
-
- [`docs/design.md`](docs/design.md) — three-layer architecture
|
|
245
|
-
- [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
|
|
246
|
-
- [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
|
|
247
|
-
- [`docs/rendered/`](docs/rendered
|
|
255
|
+
- [`docs/design.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/design.md) — three-layer architecture
|
|
256
|
+
- [`docs/statistical-methods.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/statistical-methods.md) — every metric's formula + citation
|
|
257
|
+
- [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
|
|
258
|
+
- [`docs/rendered/`](https://github.com/jturner-uofl/pycorpdiff/tree/main/docs/rendered) — static HTML renders for offline viewing
|
|
@@ -15,9 +15,9 @@ platform, and the fragmented Python NLP stack
|
|
|
15
15
|
consolidate keyness, collocations, dispersion, temporal trajectories,
|
|
16
16
|
changepoint detection, interrupted time series, causal-impact analysis,
|
|
17
17
|
forecasting, online changepoint detection, and embedding-based semantic
|
|
18
|
-
shift under a single notebook-native API.
|
|
19
|
-
KWIC evidence: `.explain(term)` returns the
|
|
20
|
-
behind any ranked term.
|
|
18
|
+
shift under a single notebook-native API. Keyness and collocation
|
|
19
|
+
results carry their own KWIC evidence: `.explain(term)` returns the
|
|
20
|
+
source-text concordances behind any ranked term.
|
|
21
21
|
|
|
22
22
|
The package answers the questions corpus linguistics, digital humanities,
|
|
23
23
|
and computational social science routinely have:
|
|
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
|
|
|
35
35
|
only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
|
|
36
36
|
via extras.
|
|
37
37
|
|
|
38
|
-
> **Status: alpha (0.1.
|
|
38
|
+
> **Status: alpha (0.1.0a4).** Public API is stable for the features
|
|
39
39
|
> described below; on PyPI as `pip install pycorpdiff`.
|
|
40
40
|
|
|
41
41
|
## The three-layer architecture
|
|
@@ -44,61 +44,71 @@ via extras.
|
|
|
44
44
|
|---|---|---|
|
|
45
45
|
| **1 — Ingestion + `Corpus`** | get text in, slice it, hash it | `from_dataframe`, `read_csv`, `read_parquet`, `read_txt`, `read_duckdb`, `from_huggingface`, `fetch_hansard`, `Corpus.slice/by_time/__hash__/doc_term_counts(_sparse)/to_polars` |
|
|
46
46
|
| **2 — Pure math** | statistics with no I/O | `keyness.{log_likelihood,chi_squared,log_ratio,percent_diff,bayes_factor,permutation_pvalues,keyness_multi,juilland_d,benjamini_hochberg}`; `collocation.{logdice,pmi,t_score,mi_three,collocation_shift,cooccurrence_network}`; `semantic.{HashEmbedder,SBERTEmbedder,semantic_trajectory,neighborhood_drift}`; `temporal.{changepoints,interrupted_time_series,forecast,causal_impact,bocpd}` |
|
|
47
|
-
| **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each
|
|
47
|
+
| **3 — Verbs + Results** | public API | `compare`, `track`, `compare.before_after`, `keyness_multi`, plus 9 frozen-dataclass Result types each implementing the relevant subset of `.to_df() / .plot() / .explain() / .summary() / .to_html() / .to_json()` |
|
|
48
48
|
|
|
49
49
|
## Quick start
|
|
50
50
|
|
|
51
51
|
```bash
|
|
52
|
-
pip install "pycorpdiff[viz
|
|
52
|
+
pip install "pycorpdiff[viz]"
|
|
53
53
|
```
|
|
54
54
|
|
|
55
55
|
```python
|
|
56
56
|
import pycorpdiff as pcd
|
|
57
57
|
|
|
58
|
-
# Bundled synthetic
|
|
58
|
+
# Bundled synthetic Hansard-style sample — runs offline, no data download.
|
|
59
59
|
corpus = pcd.load_hansard_sample()
|
|
60
60
|
immigration = corpus.slice(topic="immigration")
|
|
61
|
-
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
#
|
|
69
|
-
|
|
70
|
-
#
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
61
|
+
|
|
62
|
+
# Which words separate the humanising and criminalising frames?
|
|
63
|
+
keyness = pcd.compare(
|
|
64
|
+
immigration.slice(frame="humanising"),
|
|
65
|
+
immigration.slice(frame="criminalising"),
|
|
66
|
+
).keyness(min_count=3)
|
|
67
|
+
|
|
68
|
+
keyness.plot() # volcano plot — picture the result
|
|
69
|
+
# keyness.table.head(10) # or look at the ranked table directly
|
|
70
|
+
# keyness.explain("criminal") # KWIC concordances showing the textual evidence
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
That's the entire surface in five lines: load a corpus, slice it,
|
|
74
|
+
compare two slices, plot the result. Every other analytical method —
|
|
75
|
+
collocation shifts, semantic drift, temporal trajectories, changepoint
|
|
76
|
+
detection, causal-impact analysis, forecasting, co-occurrence networks,
|
|
77
|
+
N-way keyness — follows the same shape. See
|
|
78
|
+
[the showcase notebook](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
|
|
79
|
+
for the full feature tour, or the cheat sheet below for one-line API previews.
|
|
80
|
+
|
|
81
|
+
### Cheat sheet — every analytical surface in one block
|
|
82
|
+
|
|
83
|
+
```python
|
|
84
|
+
# Compare verbs (returns Result objects; methods exposed vary by Result)
|
|
85
|
+
pcd.compare(a, b).keyness()
|
|
86
|
+
pcd.compare(a, b).collocation_shift("immigrant")
|
|
87
|
+
pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
|
|
88
|
+
# SBERTEmbedder downloads a sentence-transformers model on first call;
|
|
89
|
+
# use pcd.HashEmbedder() for offline / deterministic-test settings.
|
|
90
|
+
|
|
91
|
+
# Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
|
|
92
|
+
tr = pcd.track(corpus, "immigrant").over_time(freq="Y")
|
|
93
|
+
tr.changepoints() # offline PELT
|
|
94
|
+
tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
|
|
95
|
+
tr.interrupted_time_series(event_date="2016") # segmented OLS
|
|
96
|
+
tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
|
|
97
|
+
tr.forecast(horizon=4) # state-space ETS
|
|
77
98
|
|
|
78
99
|
# Before / after a known event
|
|
79
100
|
pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
|
|
80
101
|
|
|
81
|
-
# N-way (≥ 2 corpora)
|
|
82
|
-
|
|
83
|
-
nhs = corpus.slice(topic="nhs")
|
|
84
|
-
pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
|
|
102
|
+
# N-way (≥ 2 corpora)
|
|
103
|
+
pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
|
|
85
104
|
|
|
86
105
|
# The discourse as a graph
|
|
87
|
-
pcd.cooccurrence_network(
|
|
88
|
-
|
|
89
|
-
# Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
|
|
106
|
+
pcd.cooccurrence_network(corpus, top_n=30).plot()
|
|
90
107
|
```
|
|
91
108
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
`pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
|
|
96
|
-
`pcd.from_huggingface(...)` to use your own corpus.
|
|
97
|
-
|
|
98
|
-
See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
|
|
99
|
-
([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
|
|
100
|
-
walkthrough on a synthetic UK Hansard corpus exercising every analytical
|
|
101
|
-
surface.
|
|
109
|
+
See [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb)
|
|
110
|
+
for a walkthrough on the synthetic Hansard-style corpus exercising
|
|
111
|
+
every analytical surface.
|
|
102
112
|
|
|
103
113
|
## Installation
|
|
104
114
|
|
|
@@ -128,7 +138,7 @@ pytest -q
|
|
|
128
138
|
|
|
129
139
|
The math agrees with the standard tools — by automated test:
|
|
130
140
|
|
|
131
|
-
- **Rayson's LL Wizard** —
|
|
141
|
+
- **Rayson's LL Wizard** — hand-derived contingency-table reference triples
|
|
132
142
|
- **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
|
|
133
143
|
- **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
|
|
134
144
|
- **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
|
|
@@ -142,11 +152,11 @@ repository" widget directly from it.
|
|
|
142
152
|
|
|
143
153
|
## License
|
|
144
154
|
|
|
145
|
-
MIT — see [LICENSE](LICENSE).
|
|
155
|
+
MIT — see [LICENSE](https://github.com/jturner-uofl/pycorpdiff/blob/main/LICENSE).
|
|
146
156
|
|
|
147
157
|
## Further reading
|
|
148
158
|
|
|
149
|
-
- [`docs/design.md`](docs/design.md) — three-layer architecture
|
|
150
|
-
- [`docs/statistical-methods.md`](docs/statistical-methods.md) — every metric's formula + citation
|
|
151
|
-
- [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
|
|
152
|
-
- [`docs/rendered/`](docs/rendered/) — static HTML renders for offline viewing
|
|
159
|
+
- [`docs/design.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/design.md) — three-layer architecture
|
|
160
|
+
- [`docs/statistical-methods.md`](https://github.com/jturner-uofl/pycorpdiff/blob/main/docs/statistical-methods.md) — every metric's formula + citation
|
|
161
|
+
- [`examples/pycorpdiff_showcase.ipynb`](https://github.com/jturner-uofl/pycorpdiff/blob/main/examples/pycorpdiff_showcase.ipynb) — full feature tour as a notebook
|
|
162
|
+
- [`docs/rendered/`](https://github.com/jturner-uofl/pycorpdiff/tree/main/docs/rendered) — static HTML renders for offline viewing
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "pycorpdiff"
|
|
7
|
-
version = "0.1.0a2"
|
|
7
|
+
version = "0.1.0a4"
|
|
8
8
|
description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = { file = "LICENSE" }
|
|
@@ -45,7 +45,7 @@ dependencies = [
|
|
|
45
45
|
]
|
|
46
46
|
|
|
47
47
|
[project.optional-dependencies]
|
|
48
|
-
# Visualisation: altair-first, matplotlib retained for
|
|
48
|
+
# Visualisation: altair-first, matplotlib retained for publication-quality figures.
|
|
49
49
|
viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
|
|
50
50
|
# Embedding-based semantic shift. sentence-transformers pulls torch
|
|
51
51
|
# transitively, which is why this is opt-in rather than a base dep.
|
|
@@ -66,8 +66,9 @@ huggingface = ["datasets>=2.14"]
|
|
|
66
66
|
# rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
|
|
67
67
|
# static SVG/PNG export of altair charts, `pysofra` renders the showcase's
|
|
68
68
|
# result tables in JAMA-style typography.
|
|
69
|
-
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.
|
|
70
|
-
# Meta-extra
|
|
69
|
+
notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
|
|
70
|
+
# Meta-extra: `pycorpdiff[all]` pulls in every optional code path
|
|
71
|
+
# including the notebook runtime.
|
|
71
72
|
all = [
|
|
72
73
|
"altair>=5",
|
|
73
74
|
"matplotlib>=3.8",
|
|
@@ -81,8 +82,9 @@ all = [
|
|
|
81
82
|
"pyarrow>=15",
|
|
82
83
|
"duckdb>=0.10",
|
|
83
84
|
"spacy>=3.7",
|
|
85
|
+
"jupyter>=1.0",
|
|
84
86
|
"vl-convert-python>=1.5",
|
|
85
|
-
"pysofra>=0.1.
|
|
87
|
+
"pysofra>=0.1.0a3",
|
|
86
88
|
]
|
|
87
89
|
dev = [
|
|
88
90
|
"pytest>=8",
|
|
@@ -14,12 +14,12 @@ Example
|
|
|
14
14
|
|
|
15
15
|
>>> import pycorpdiff as pcd
|
|
16
16
|
>>> pcd.__version__
|
|
17
|
-
'0.1.0a2'
|
|
17
|
+
'0.1.0a4'
|
|
18
18
|
"""
|
|
19
19
|
|
|
20
20
|
from __future__ import annotations
|
|
21
21
|
|
|
22
|
-
__version__ = "0.1.0a2"
|
|
22
|
+
__version__ = "0.1.0a4"
|
|
23
23
|
|
|
24
24
|
from .collocation.network import NetworkResult, cooccurrence_network
|
|
25
25
|
from .compare import Comparison, compare
|
|
@@ -66,7 +66,7 @@ class Comparison:
|
|
|
66
66
|
require ``effect_size=True`` and sort by that column.
|
|
67
67
|
effect_size
|
|
68
68
|
If True (default), also compute LogRatio (Hardie),
|
|
69
|
-
%DIFF (Gabrielatos), and the BIC-Bayes factor
|
|
69
|
+
%DIFF (Gabrielatos), and the BIC-approximated Bayes factor.
|
|
70
70
|
dispersion
|
|
71
71
|
If True, compute Juilland's D for both corpora and flag
|
|
72
72
|
terms where ``D < 0.5`` in either — the canonical "this is
|
|
@@ -172,7 +172,7 @@ TOPICS = ["immigration", "brexit", "nhs", "climate"]
|
|
|
172
172
|
|
|
173
173
|
|
|
174
174
|
def generate(seed: int = 20260522) -> pd.DataFrame:
|
|
175
|
-
"""Return a deterministic
|
|
175
|
+
"""Return a deterministic 193-speech synthetic Hansard sample."""
|
|
176
176
|
rng = np.random.default_rng(seed)
|
|
177
177
|
rows: list[dict[str, object]] = []
|
|
178
178
|
speech_id = 0
|
|
@@ -26,9 +26,10 @@ def bayes_factor(
|
|
|
26
26
|
) -> pd.Series:
|
|
27
27
|
"""BIC-approximated Bayes factor for each term's frequency difference.
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
is the total tokens across both corpora and ``G²`` is
|
|
31
|
-
log-likelihood. The Bayes factor is then
|
|
29
|
+
The BIC approximation (Kass & Raftery 1995): ``BIC = |G²| - ln(N)``
|
|
30
|
+
where ``N`` is the total tokens across both corpora and ``G²`` is
|
|
31
|
+
the unsigned log-likelihood. The Bayes factor is then
|
|
32
|
+
``exp(BIC / 2)``. Wilson (2013) is the keyness application.
|
|
32
33
|
|
|
33
34
|
Interpret with Kass & Raftery (1995):
|
|
34
35
|
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
"""Visualisation helpers — altair-first, matplotlib for
|
|
1
|
+
"""Visualisation helpers — altair-first, matplotlib for publication-quality figures.
|
|
2
2
|
|
|
3
3
|
Every Result type's ``.plot()`` method delegates here. Plot functions
|
|
4
4
|
also accept a bare DataFrame so users can call
|
|
@@ -6,8 +6,10 @@ single-cell keyness computation in corpus linguistics. Every value
|
|
|
6
6
|
asserted below was either computed from Rayson's exact formula or
|
|
7
7
|
copy-pasted from his calculator on a clean dataset.
|
|
8
8
|
|
|
9
|
-
This file extends ``test_loglikelihood.py`` with
|
|
10
|
-
|
|
9
|
+
This file extends ``test_loglikelihood.py`` with a broader sweep of
|
|
10
|
+
canonical reference triples covering edge cases (lopsided counts,
|
|
11
|
+
sparse cells, mid-sized over-representation) so that any future
|
|
12
|
+
refactor of the LL formula trips multiple assertions simultaneously.
|
|
11
13
|
"""
|
|
12
14
|
|
|
13
15
|
from __future__ import annotations
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
{pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a4}/src/pycorpdiff/datasets/_data/hansard_sample.parquet
RENAMED
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|