pycorpdiff 0.1.0a0__tar.gz → 0.1.0a2__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (128)
  1. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/CHANGELOG.md +1 -1
  2. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/CITATION.cff +7 -14
  3. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/PKG-INFO +49 -32
  4. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/README.md +47 -30
  5. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/pyproject.toml +11 -13
  6. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/__init__.py +9 -10
  7. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/results.py +10 -1
  8. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_smoke.py +4 -6
  9. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/.gitignore +0 -0
  10. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/LICENSE +0 -0
  11. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/__init__.py +0 -0
  12. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/pandas.py +0 -0
  13. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/_backends/polars.py +0 -0
  14. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/__init__.py +0 -0
  15. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  16. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/measures.py +0 -0
  17. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/network.py +0 -0
  18. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/collocation/shift.py +0 -0
  19. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/compare.py +0 -0
  20. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/corpus.py +0 -0
  21. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/__init__.py +0 -0
  22. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  23. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  24. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/hansard.py +0 -0
  25. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/datasets/histwords.py +0 -0
  26. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/explain.py +0 -0
  27. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/__init__.py +0 -0
  28. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/duckdb.py +0 -0
  29. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/huggingface.py +0 -0
  30. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/io/readers.py +0 -0
  31. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/__init__.py +0 -0
  32. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/bayes.py +0 -0
  33. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  34. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/correction.py +0 -0
  35. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/dispersion.py +0 -0
  36. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  37. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
  38. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  39. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/keyness/permutation.py +0 -0
  40. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/py.typed +0 -0
  41. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/__init__.py +0 -0
  42. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/alignment.py +0 -0
  43. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/embed.py +0 -0
  44. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/shift.py +0 -0
  45. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/semantic/trajectory.py +0 -0
  46. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/stats.py +0 -0
  47. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/__init__.py +0 -0
  48. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/bocpd.py +0 -0
  49. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  50. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/changepoint.py +0 -0
  51. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/forecast.py +0 -0
  52. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/its.py +0 -0
  53. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/temporal/slicing.py +0 -0
  54. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/tokenize.py +0 -0
  55. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/__init__.py +0 -0
  56. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/bocpd.py +0 -0
  57. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/causal_impact.py +0 -0
  58. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/collocation.py +0 -0
  59. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/dispersion.py +0 -0
  60. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/forecast.py +0 -0
  61. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/keyness.py +0 -0
  62. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/network.py +0 -0
  63. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/scattertext.py +0 -0
  64. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  65. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/src/pycorpdiff/viz/trajectory.py +0 -0
  66. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/__init__.py +0 -0
  67. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/conftest.py +0 -0
  68. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/fixtures/__init__.py +0 -0
  69. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/__init__.py +0 -0
  70. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_collocation_integration.py +0 -0
  71. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_histwords.py +0 -0
  72. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_nltk.py +0 -0
  73. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_quanteda.py +0 -0
  74. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_rayson.py +0 -0
  75. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_crossval_scattertext.py +0 -0
  76. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_explain_integration.py +0 -0
  77. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_keyness_integration.py +0 -0
  78. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_sbert_slow.py +0 -0
  79. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_semantic_integration.py +0 -0
  80. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_stop_words.py +0 -0
  81. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_temporal_stats.py +0 -0
  82. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/integration/test_viz.py +0 -0
  83. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/__init__.py +0 -0
  84. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_collocation_properties.py +0 -0
  85. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_keyness_properties.py +0 -0
  86. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/property/test_temporal_properties.py +0 -0
  87. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/__init__.py +0 -0
  88. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_bayes_factor.py +0 -0
  89. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_bocpd.py +0 -0
  90. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_causal_impact.py +0 -0
  91. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_changepoint.py +0 -0
  92. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_chi_squared.py +0 -0
  93. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_cooccurrence.py +0 -0
  94. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_measures.py +0 -0
  95. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_collocation_shift.py +0 -0
  96. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_comparison_concordance.py +0 -0
  97. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_cooccurrence_network.py +0 -0
  98. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_corpus_hash.py +0 -0
  99. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_corpus_vocab.py +0 -0
  100. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_correction.py +0 -0
  101. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_datasets_hansard.py +0 -0
  102. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_dispersion.py +0 -0
  103. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_dispersion_plot.py +0 -0
  104. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  105. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_effect_sizes.py +0 -0
  106. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_embedders.py +0 -0
  107. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_explain.py +0 -0
  108. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_forecast.py +0 -0
  109. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_forecast_semantic_drift.py +0 -0
  110. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_from_huggingface.py +0 -0
  111. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_hansard_fetcher.py +0 -0
  112. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_histwords_loader.py +0 -0
  113. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_its.py +0 -0
  114. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_keyness_multi.py +0 -0
  115. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_loglikelihood.py +0 -0
  116. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_ngram_tokenizer.py +0 -0
  117. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_permutation_keyness.py +0 -0
  118. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_polars_interop.py +0 -0
  119. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_procrustes.py +0 -0
  120. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_read_duckdb.py +0 -0
  121. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_read_txt_line_mode.py +0 -0
  122. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_result_exports.py +0 -0
  123. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_scattertext_plot.py +0 -0
  124. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_neighbours.py +0 -0
  125. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_shift.py +0 -0
  126. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_semantic_trajectory.py +0 -0
  127. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_temporal.py +0 -0
  128. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a2}/tests/unit/test_wilson_ci.py +0 -0
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
4
4
  follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
5
  project adheres to [Semantic Versioning](https://semver.org/).
6
6
 
7
- ## [0.1.0a0] — initial release
7
+ ## [0.1.0a2] — initial release
8
8
 
9
9
  The initial public release of `pycorpdiff` — comparative corpus analysis
10
10
  for modern Python workflows. Three public verbs (`compare`, `track`,
@@ -1,11 +1,10 @@
1
1
  cff-version: 1.2.0
2
2
  message: >
3
- If you use pycorpdiff in academic work, please cite both the
4
- software (this entry) and the accompanying Journal of Statistical
5
- Software paper once it appears. The JSS manuscript is in
6
- preparation; the draft will live in this repository as paper/paper.tex.
3
+ If you use pycorpdiff in academic work, please cite this software
4
+ entry. GitHub renders a "Cite this repository" widget directly from
5
+ this file.
7
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
8
- version: 0.1.0a0
7
+ version: 0.1.0a2
9
8
  date-released: 2026-05-22
10
9
  authors:
11
10
  - family-names: Turner
@@ -34,16 +33,10 @@ abstract: >
34
33
  computational social science, and discourse analysis research,
35
34
  emphasising interpretability, explainability, statistical rigour,
36
35
  and reproducibility.
37
- preferred-citation:
38
- type: article
39
- authors:
40
- - family-names: Turner
41
- given-names: Jason
42
- title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
43
- journal: "Journal of Statistical Software"
44
- year: 2026
45
- status: in-preparation
46
36
  identifiers:
47
37
  - type: url
48
38
  value: "https://github.com/jturner-uofl/pycorpdiff"
49
39
  description: Project repository
40
+ - type: url
41
+ value: "https://pypi.org/project/pycorpdiff/"
42
+ description: PyPI release
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a0
3
+ Version: 0.1.0a2
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -30,7 +30,7 @@ License: MIT License
30
30
  SOFTWARE.
31
31
  License-File: LICENSE
32
32
  Keywords: collocation,comparative corpus analysis,computational social science,corpus linguistics,diachronic nlp,digital humanities,discourse analysis,keyness,semantic change,temporal text analysis
33
- Classifier: Development Status :: 2 - Pre-Alpha
33
+ Classifier: Development Status :: 3 - Alpha
34
34
  Classifier: Intended Audience :: Science/Research
35
35
  Classifier: License :: OSI Approved :: MIT License
36
36
  Classifier: Programming Language :: Python :: 3
@@ -95,15 +95,10 @@ Description-Content-Type: text/markdown
95
95
 
96
96
  # pycorpdiff
97
97
 
98
- <!--
99
- TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
100
-
101
98
  [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
102
99
  [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
103
100
  [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
104
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
105
101
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
106
- -->
107
102
 
108
103
  **Comparative corpus analysis for modern Python workflows.**
109
104
 
@@ -135,8 +130,8 @@ points — one-line adapters, no plugin registry. The base install pulls
135
130
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
136
131
  via extras.
137
132
 
138
- > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
139
- > features described below; PyPI publication is the next milestone.
133
+ > **Status: alpha (0.1.0a2).** Public API is stable for the features
134
+ > described below; on PyPI as `pip install pycorpdiff`.
140
135
 
141
136
  ## The three-layer architecture
142
137
 
@@ -148,36 +143,53 @@ via extras.
148
143
 
149
144
  ## Quick start
150
145
 
146
+ ```bash
147
+ pip install "pycorpdiff[viz,temporal]"
148
+ ```
149
+
151
150
  ```python
152
151
  import pycorpdiff as pcd
153
152
 
154
- news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
153
+ # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
154
+ corpus = pcd.load_hansard_sample()
155
+ immigration = corpus.slice(topic="immigration")
156
+ human = immigration.slice(frame="humanising")
157
+ criminal = immigration.slice(frame="criminalising")
155
158
 
156
159
  # Compare — three verbs
157
- k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
158
- c = pcd.compare(a, b).collocation_shift("migrant")
159
- s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
160
+ k = pcd.compare(human, criminal).keyness()
161
+ c = pcd.compare(human, criminal).collocation_shift("immigrant")
162
+ # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
163
+ # ↑ requires `pip install "pycorpdiff[semantic]"`
160
164
 
161
165
  # Track over time
162
- tr = pcd.track(news, "migrant").over_time(freq="Y")
163
- tr.changepoints() # offline PELT
164
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
165
- tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
166
- tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen 2015)
167
- tr.forecast(horizon=4) # state-space ETS
166
+ tr = pcd.track(immigration, "criminal").over_time(freq="Y")
167
+ tr.changepoints() # offline PELT
168
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
169
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
170
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
171
+ tr.forecast(horizon=4) # state-space ETS
168
172
 
169
173
  # Before / after a known event
170
- pcd.compare.before_after(news, event_date="2016-06-23").keyness()
174
+ pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
171
175
 
172
- # N-way (≥ 2 corpora)
173
- pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
176
+ # N-way (≥ 2 corpora) — one keyness across all four parties
177
+ parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
178
+ nhs = corpus.slice(topic="nhs")
179
+ pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
174
180
 
175
181
  # The discourse as a graph
176
- pcd.cooccurrence_network(news, top_n=50).plot()
182
+ pcd.cooccurrence_network(immigration, top_n=30).plot()
177
183
 
178
184
  # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
179
185
  ```
180
186
 
187
+ Every line of the snippet above is verified end-to-end against
188
+ `pip install "pycorpdiff[viz,temporal]"` — no data download required.
189
+ Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
190
+ `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
191
+ `pcd.from_huggingface(...)` to use your own corpus.
192
+
181
193
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
182
194
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
183
195
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -185,23 +197,28 @@ surface.
185
197
 
186
198
  ## Installation
187
199
 
188
- <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
200
+ ```bash
201
+ pip install pycorpdiff # lexical-comparative core
202
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
203
+ pip install "pycorpdiff[semantic]" # + sentence-transformers / scikit-learn
204
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
205
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
206
+ pip install "pycorpdiff[all]" # everything
207
+ ```
189
208
 
190
- Currently a pre-release alpha. From a local clone:
209
+ The base install keeps a small dependency footprint (`numpy`, `pandas`,
210
+ `scipy`, `pyarrow`); optional extras land per analytical layer so you
211
+ only pay for what you use.
212
+
213
+ To work from source:
191
214
 
192
215
  ```bash
193
216
  git clone https://github.com/jturner-uofl/pycorpdiff
194
217
  cd pycorpdiff
195
218
  pip install -e ".[dev]"
196
- pytest -q # 519 default tests, ~7s
219
+ pytest -q
197
220
  ```
198
221
 
199
- Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
200
- (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
201
- statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
202
- `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
203
- or `[all]`.
204
-
205
222
  ## Cross-validation receipts
206
223
 
207
224
  The math agrees with the standard tools — by automated test:
@@ -1,14 +1,9 @@
1
1
  # pycorpdiff
2
2
 
3
- <!--
4
- TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
5
-
6
3
  [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
7
4
  [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
8
5
  [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
9
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
10
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
11
- -->
12
7
 
13
8
  **Comparative corpus analysis for modern Python workflows.**
14
9
 
@@ -40,8 +35,8 @@ points — one-line adapters, no plugin registry. The base install pulls
40
35
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
41
36
  via extras.
42
37
 
43
- > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
44
- > features described below; PyPI publication is the next milestone.
38
+ > **Status: alpha (0.1.0a2).** Public API is stable for the features
39
+ > described below; on PyPI as `pip install pycorpdiff`.
45
40
 
46
41
  ## The three-layer architecture
47
42
 
@@ -53,36 +48,53 @@ via extras.
53
48
 
54
49
  ## Quick start
55
50
 
51
+ ```bash
52
+ pip install "pycorpdiff[viz,temporal]"
53
+ ```
54
+
56
55
  ```python
57
56
  import pycorpdiff as pcd
58
57
 
59
- news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
58
+ # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
59
+ corpus = pcd.load_hansard_sample()
60
+ immigration = corpus.slice(topic="immigration")
61
+ human = immigration.slice(frame="humanising")
62
+ criminal = immigration.slice(frame="criminalising")
60
63
 
61
64
  # Compare — three verbs
62
- k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
63
- c = pcd.compare(a, b).collocation_shift("migrant")
64
- s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
65
+ k = pcd.compare(human, criminal).keyness()
66
+ c = pcd.compare(human, criminal).collocation_shift("immigrant")
67
+ # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
68
+ # ↑ requires `pip install "pycorpdiff[semantic]"`
65
69
 
66
70
  # Track over time
67
- tr = pcd.track(news, "migrant").over_time(freq="Y")
68
- tr.changepoints() # offline PELT
69
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
70
- tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
71
- tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen 2015)
72
- tr.forecast(horizon=4) # state-space ETS
71
+ tr = pcd.track(immigration, "criminal").over_time(freq="Y")
72
+ tr.changepoints() # offline PELT
73
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
74
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
75
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
76
+ tr.forecast(horizon=4) # state-space ETS
73
77
 
74
78
  # Before / after a known event
75
- pcd.compare.before_after(news, event_date="2016-06-23").keyness()
79
+ pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
76
80
 
77
- # N-way (≥ 2 corpora)
78
- pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
81
+ # N-way (≥ 2 corpora) — one keyness across all four parties
82
+ parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
83
+ nhs = corpus.slice(topic="nhs")
84
+ pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
79
85
 
80
86
  # The discourse as a graph
81
- pcd.cooccurrence_network(news, top_n=50).plot()
87
+ pcd.cooccurrence_network(immigration, top_n=30).plot()
82
88
 
83
89
  # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
84
90
  ```
85
91
 
92
+ Every line of the snippet above is verified end-to-end against
93
+ `pip install "pycorpdiff[viz,temporal]"` — no data download required.
94
+ Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
95
+ `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
96
+ `pcd.from_huggingface(...)` to use your own corpus.
97
+
86
98
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
87
99
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
88
100
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -90,23 +102,28 @@ surface.
90
102
 
91
103
  ## Installation
92
104
 
93
- <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
105
+ ```bash
106
+ pip install pycorpdiff # lexical-comparative core
107
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
108
+ pip install "pycorpdiff[semantic]" # + sentence-transformers / scikit-learn
109
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
110
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
111
+ pip install "pycorpdiff[all]" # everything
112
+ ```
94
113
 
95
- Currently a pre-release alpha. From a local clone:
114
+ The base install keeps a small dependency footprint (`numpy`, `pandas`,
115
+ `scipy`, `pyarrow`); optional extras land per analytical layer so you
116
+ only pay for what you use.
117
+
118
+ To work from source:
96
119
 
97
120
  ```bash
98
121
  git clone https://github.com/jturner-uofl/pycorpdiff
99
122
  cd pycorpdiff
100
123
  pip install -e ".[dev]"
101
- pytest -q # 519 default tests, ~7s
124
+ pytest -q
102
125
  ```
103
126
 
104
- Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
105
- (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
106
- statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
107
- `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
108
- or `[all]`.
109
-
110
127
  ## Cross-validation receipts
111
128
 
112
129
  The math agrees with the standard tools — by automated test:
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a0"
7
+ version = "0.1.0a2"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -23,7 +23,7 @@ keywords = [
23
23
  "temporal text analysis",
24
24
  ]
25
25
  classifiers = [
26
- "Development Status :: 2 - Pre-Alpha",
26
+ "Development Status :: 3 - Alpha",
27
27
  "Intended Audience :: Science/Research",
28
28
  "License :: OSI Approved :: MIT License",
29
29
  "Programming Language :: Python :: 3",
@@ -36,9 +36,9 @@ classifiers = [
36
36
  ]
37
37
  dependencies = [
38
38
  "numpy>=1.24",
39
- # Capped at <3 to match the constraint pysofra discovered: pandas 3.x
40
- # raises Pandas4Warning under strict warning filters via third-party
41
- # DataFrame copies. Lift when the wider PyData stack catches up.
39
+ # Capped at <3: pandas 3.x raises Pandas4Warning under strict
40
+ # warning filters via third-party DataFrame copies. Lift when the
41
+ # wider PyData stack catches up.
42
42
  "pandas>=2.0,<3",
43
43
  "scipy>=1.11",
44
44
  "pyarrow>=14",
@@ -47,13 +47,12 @@ dependencies = [
47
47
  [project.optional-dependencies]
48
48
  # Visualisation: altair-first, matplotlib retained for paper-grade figures.
49
49
  viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
50
- # Embedding-based semantic shift (Phase 2). sentence-transformers pulls
51
- # torch transitively, which is why this is opt-in rather than a base dep.
50
+ # Embedding-based semantic shift. sentence-transformers pulls torch
51
+ # transitively, which is why this is opt-in rather than a base dep.
52
52
  semantic = ["sentence-transformers>=2.2", "scikit-learn>=1.3"]
53
53
  # Changepoint detection + interrupted time series.
54
54
  temporal = ["ruptures>=1.1", "statsmodels>=0.14"]
55
- # Optional columnar backend. polars.from_pandas() needs pyarrow at runtime
56
- # (mirrors pysofra's pinning).
55
+ # Optional columnar backend. polars.from_pandas() needs pyarrow at runtime.
57
56
  polars = ["polars>=1.0", "pyarrow>=15"]
58
57
  # Out-of-core querying for large corpora.
59
58
  duckdb = ["duckdb>=0.10"]
@@ -92,9 +91,9 @@ dev = [
92
91
  "ruff>=0.4",
93
92
  "mypy>=1.8",
94
93
  "pre-commit>=3.6",
95
- # pandas-stubs sharpens mypy strict typing for pandas surfaces (per
96
- # pysofra). Without it, ignore_missing_imports would mask real
97
- # typing regressions in pandas-mediated code paths.
94
+ # pandas-stubs sharpens mypy strict typing for pandas surfaces.
95
+ # Without it, ignore_missing_imports would mask real typing
96
+ # regressions in pandas-mediated code paths.
98
97
  "pandas-stubs>=2.2",
99
98
  ]
100
99
  [project.urls]
@@ -119,7 +118,6 @@ include = [
119
118
  exclude = [
120
119
  "docs",
121
120
  "examples",
122
- "site",
123
121
  ".github",
124
122
  "uv.lock",
125
123
  ]
@@ -1,26 +1,25 @@
1
1
  """pycorpdiff — comparative corpus analysis for modern Python workflows.
2
2
 
3
3
  The package exposes three public verbs (:func:`compare`, :func:`track`,
4
- plus the :class:`Corpus` constructor and the I/O ``read_*`` helpers) and
5
- four families of result objects (:class:`KeynessResult`,
6
- :class:`CollocationShiftResult`, :class:`SemanticShiftResult`,
7
- :class:`TemporalTrajectory`).
8
-
9
- Layer-1 ingestion utilities are functional in this scaffolding release;
10
- Layer-2 analytical methods raise :class:`NotImplementedError` until Phase 1
11
- of the roadmap lands.
4
+ :func:`compare.before_after`) and a family of frozen-dataclass
5
+ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
6
+ :class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
7
+ :class:`NetworkResult`, :class:`ForecastResult`,
8
+ :class:`CausalImpactResult`, :class:`BocpdResult`,
9
+ :class:`ConcordanceResult`), each implementing the same
10
+ ``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
12
11
 
13
12
  Example
14
13
  -------
15
14
 
16
15
  >>> import pycorpdiff as pcd
17
16
  >>> pcd.__version__
18
- '0.1.0a0'
17
+ '0.1.0a2'
19
18
  """
20
19
 
21
20
  from __future__ import annotations
22
21
 
23
- __version__ = "0.1.0a0"
22
+ __version__ = "0.1.0a2"
24
23
 
25
24
  from .collocation.network import NetworkResult, cooccurrence_network
26
25
  from .compare import Comparison, compare
@@ -251,7 +251,16 @@ class SemanticShiftResult:
251
251
  return _table_to_json(self.table, path, **kw)
252
252
 
253
253
  def plot(self, **kw: Any) -> alt.Chart:
254
- raise NotImplementedError("SemanticShiftResult.plot() lands in Phase 6")
254
+ """Plotting for SemanticShiftResult is not yet implemented.
255
+
256
+ For a forward-looking trajectory of cosine distances, use
257
+ :func:`pycorpdiff.semantic_trajectory` and pass the resulting
258
+ DataFrame to :func:`pycorpdiff.viz.semantic_forecast_plot`.
259
+ """
260
+ raise NotImplementedError(
261
+ "SemanticShiftResult.plot() is not yet implemented; "
262
+ "use .table or pcd.viz.semantic_forecast_plot() instead"
263
+ )
255
264
 
256
265
  def neighbors_before(
257
266
  self, target: str | None = None, n: int = 10
@@ -1,10 +1,8 @@
1
- """Smoke tests for the Phase 0 scaffolding.
1
+ """Smoke tests for the public surface.
2
2
 
3
- These exercise the parts of the package that are real in the scaffolding
4
- release: imports, the Corpus constructor, slicing, the regex tokenizer,
5
- and the CSV/parquet readers. Analytical methods are expected to raise
6
- NotImplementedError and are intentionally not exercised here — Phase 1
7
- will replace those tests with real ones.
3
+ Quick exercises of imports, the :class:`Corpus` constructor, slicing,
4
+ the default regex tokenizer, and the CSV/parquet readers. The deeper
5
+ analytical surfaces have their own dedicated test modules.
8
6
  """
9
7
 
10
8
  from __future__ import annotations
File without changes
File without changes