pycorpdiff 0.1.0a0__tar.gz → 0.1.0a1__tar.gz

This diff compares the contents of two publicly released versions of a package as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
Files changed (128)
  1. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/CHANGELOG.md +1 -1
  2. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/CITATION.cff +7 -14
  3. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/PKG-INFO +44 -32
  4. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/README.md +42 -30
  5. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/pyproject.toml +11 -13
  6. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/__init__.py +9 -10
  7. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/results.py +10 -1
  8. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_smoke.py +4 -6
  9. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/.gitignore +0 -0
  10. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/LICENSE +0 -0
  11. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/__init__.py +0 -0
  12. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/pandas.py +0 -0
  13. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/_backends/polars.py +0 -0
  14. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/__init__.py +0 -0
  15. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  16. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/measures.py +0 -0
  17. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/network.py +0 -0
  18. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/collocation/shift.py +0 -0
  19. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/compare.py +0 -0
  20. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/corpus.py +0 -0
  21. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/__init__.py +0 -0
  22. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  23. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  24. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/hansard.py +0 -0
  25. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/datasets/histwords.py +0 -0
  26. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/explain.py +0 -0
  27. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/__init__.py +0 -0
  28. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/duckdb.py +0 -0
  29. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/huggingface.py +0 -0
  30. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/io/readers.py +0 -0
  31. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/__init__.py +0 -0
  32. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/bayes.py +0 -0
  33. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  34. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/correction.py +0 -0
  35. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/dispersion.py +0 -0
  36. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  37. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
  38. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  39. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/keyness/permutation.py +0 -0
  40. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/py.typed +0 -0
  41. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/__init__.py +0 -0
  42. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/alignment.py +0 -0
  43. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/embed.py +0 -0
  44. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/shift.py +0 -0
  45. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/semantic/trajectory.py +0 -0
  46. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/stats.py +0 -0
  47. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/__init__.py +0 -0
  48. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/bocpd.py +0 -0
  49. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  50. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/changepoint.py +0 -0
  51. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/forecast.py +0 -0
  52. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/its.py +0 -0
  53. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/temporal/slicing.py +0 -0
  54. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/tokenize.py +0 -0
  55. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/__init__.py +0 -0
  56. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/bocpd.py +0 -0
  57. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/causal_impact.py +0 -0
  58. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/collocation.py +0 -0
  59. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/dispersion.py +0 -0
  60. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/forecast.py +0 -0
  61. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/keyness.py +0 -0
  62. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/network.py +0 -0
  63. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/scattertext.py +0 -0
  64. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  65. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/src/pycorpdiff/viz/trajectory.py +0 -0
  66. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/__init__.py +0 -0
  67. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/conftest.py +0 -0
  68. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/fixtures/__init__.py +0 -0
  69. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/__init__.py +0 -0
  70. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_collocation_integration.py +0 -0
  71. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_histwords.py +0 -0
  72. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_nltk.py +0 -0
  73. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_quanteda.py +0 -0
  74. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_rayson.py +0 -0
  75. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_crossval_scattertext.py +0 -0
  76. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_explain_integration.py +0 -0
  77. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_keyness_integration.py +0 -0
  78. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_sbert_slow.py +0 -0
  79. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_semantic_integration.py +0 -0
  80. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_stop_words.py +0 -0
  81. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_temporal_stats.py +0 -0
  82. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/integration/test_viz.py +0 -0
  83. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/__init__.py +0 -0
  84. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_collocation_properties.py +0 -0
  85. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_keyness_properties.py +0 -0
  86. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/property/test_temporal_properties.py +0 -0
  87. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/__init__.py +0 -0
  88. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_bayes_factor.py +0 -0
  89. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_bocpd.py +0 -0
  90. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_causal_impact.py +0 -0
  91. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_changepoint.py +0 -0
  92. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_chi_squared.py +0 -0
  93. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_cooccurrence.py +0 -0
  94. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_measures.py +0 -0
  95. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_collocation_shift.py +0 -0
  96. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_comparison_concordance.py +0 -0
  97. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_cooccurrence_network.py +0 -0
  98. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_corpus_hash.py +0 -0
  99. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_corpus_vocab.py +0 -0
  100. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_correction.py +0 -0
  101. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_datasets_hansard.py +0 -0
  102. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_dispersion.py +0 -0
  103. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_dispersion_plot.py +0 -0
  104. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  105. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_effect_sizes.py +0 -0
  106. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_embedders.py +0 -0
  107. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_explain.py +0 -0
  108. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_forecast.py +0 -0
  109. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_forecast_semantic_drift.py +0 -0
  110. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_from_huggingface.py +0 -0
  111. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_hansard_fetcher.py +0 -0
  112. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_histwords_loader.py +0 -0
  113. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_its.py +0 -0
  114. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_keyness_multi.py +0 -0
  115. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_loglikelihood.py +0 -0
  116. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_ngram_tokenizer.py +0 -0
  117. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_permutation_keyness.py +0 -0
  118. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_polars_interop.py +0 -0
  119. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_procrustes.py +0 -0
  120. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_read_duckdb.py +0 -0
  121. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_read_txt_line_mode.py +0 -0
  122. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_result_exports.py +0 -0
  123. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_scattertext_plot.py +0 -0
  124. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_neighbours.py +0 -0
  125. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_shift.py +0 -0
  126. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_semantic_trajectory.py +0 -0
  127. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_temporal.py +0 -0
  128. {pycorpdiff-0.1.0a0 → pycorpdiff-0.1.0a1}/tests/unit/test_wilson_ci.py +0 -0
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
4
4
  follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
5
  project adheres to [Semantic Versioning](https://semver.org/).
6
6
 
7
- ## [0.1.0a0] — initial release
7
+ ## [0.1.0a1] — initial release
8
8
 
9
9
  The initial public release of `pycorpdiff` — comparative corpus analysis
10
10
  for modern Python workflows. Three public verbs (`compare`, `track`,
@@ -1,11 +1,10 @@
1
1
  cff-version: 1.2.0
2
2
  message: >
3
- If you use pycorpdiff in academic work, please cite both the
4
- software (this entry) and the accompanying Journal of Statistical
5
- Software paper once it appears. The JSS manuscript is in
6
- preparation; the draft will live in this repository as paper/paper.tex.
3
+ If you use pycorpdiff in academic work, please cite this software
4
+ entry. GitHub renders a "Cite this repository" widget directly from
5
+ this file.
7
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
8
- version: 0.1.0a0
7
+ version: 0.1.0a1
9
8
  date-released: 2026-05-22
10
9
  authors:
11
10
  - family-names: Turner
@@ -34,16 +33,10 @@ abstract: >
34
33
  computational social science, and discourse analysis research,
35
34
  emphasising interpretability, explainability, statistical rigour,
36
35
  and reproducibility.
37
- preferred-citation:
38
- type: article
39
- authors:
40
- - family-names: Turner
41
- given-names: Jason
42
- title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
43
- journal: "Journal of Statistical Software"
44
- year: 2026
45
- status: in-preparation
46
36
  identifiers:
47
37
  - type: url
48
38
  value: "https://github.com/jturner-uofl/pycorpdiff"
49
39
  description: Project repository
40
+ - type: url
41
+ value: "https://pypi.org/project/pycorpdiff/"
42
+ description: PyPI release
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a0
3
+ Version: 0.1.0a1
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -30,7 +30,7 @@ License: MIT License
30
30
  SOFTWARE.
31
31
  License-File: LICENSE
32
32
  Keywords: collocation,comparative corpus analysis,computational social science,corpus linguistics,diachronic nlp,digital humanities,discourse analysis,keyness,semantic change,temporal text analysis
33
- Classifier: Development Status :: 2 - Pre-Alpha
33
+ Classifier: Development Status :: 3 - Alpha
34
34
  Classifier: Intended Audience :: Science/Research
35
35
  Classifier: License :: OSI Approved :: MIT License
36
36
  Classifier: Programming Language :: Python :: 3
@@ -95,15 +95,10 @@ Description-Content-Type: text/markdown
95
95
 
96
96
  # pycorpdiff
97
97
 
98
- <!--
99
- TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
100
-
101
98
  [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
102
99
  [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
103
100
  [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
104
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
105
101
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
106
- -->
107
102
 
108
103
  **Comparative corpus analysis for modern Python workflows.**
109
104
 
@@ -135,8 +130,8 @@ points — one-line adapters, no plugin registry. The base install pulls
135
130
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
136
131
  via extras.
137
132
 
138
- > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
139
- > features described below; PyPI publication is the next milestone.
133
+ > **Status: alpha (0.1.0a1).** Public API is stable for the features
134
+ > described below; on PyPI as `pip install pycorpdiff`.
140
135
 
141
136
  ## The three-layer architecture
142
137
 
@@ -151,33 +146,45 @@ via extras.
151
146
  ```python
152
147
  import pycorpdiff as pcd
153
148
 
154
- news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
149
+ # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
150
+ corpus = pcd.load_hansard_sample()
151
+ immigration = corpus.slice(topic="immigration")
152
+ human = immigration.slice(frame="humanising")
153
+ criminal = immigration.slice(frame="criminalising")
155
154
 
156
155
  # Compare — three verbs
157
- k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
158
- c = pcd.compare(a, b).collocation_shift("migrant")
159
- s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
156
+ k = pcd.compare(human, criminal).keyness()
157
+ c = pcd.compare(human, criminal).collocation_shift("immigrant")
158
+ # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
159
+ # ↑ requires `pip install "pycorpdiff[semantic]"`
160
160
 
161
161
  # Track over time
162
- tr = pcd.track(news, "migrant").over_time(freq="Y")
163
- tr.changepoints() # offline PELT
164
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
165
- tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
166
- tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen 2015)
167
- tr.forecast(horizon=4) # state-space ETS
162
+ tr = pcd.track(immigration, "criminal").over_time(freq="Y")
163
+ tr.changepoints() # offline PELT
164
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
165
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
166
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
167
+ tr.forecast(horizon=4) # state-space ETS
168
168
 
169
169
  # Before / after a known event
170
- pcd.compare.before_after(news, event_date="2016-06-23").keyness()
170
+ pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
171
171
 
172
- # N-way (≥ 2 corpora)
173
- pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
172
+ # N-way (≥ 2 corpora) — one keyness across all four parties
173
+ parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
174
+ nhs = corpus.slice(topic="nhs")
175
+ pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
174
176
 
175
177
  # The discourse as a graph
176
- pcd.cooccurrence_network(news, top_n=50).plot()
178
+ pcd.cooccurrence_network(immigration, top_n=30).plot()
177
179
 
178
180
  # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
179
181
  ```
180
182
 
183
+ The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
184
+ download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
185
+ `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
186
+ to use your own corpus.
187
+
181
188
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
182
189
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
183
190
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -185,23 +192,28 @@ surface.
185
192
 
186
193
  ## Installation
187
194
 
188
- <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
195
+ ```bash
196
+ pip install pycorpdiff # lexical-comparative core
197
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
198
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
199
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
200
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
201
+ pip install "pycorpdiff[all]" # everything
202
+ ```
203
+
204
+ The base install keeps a small dependency footprint (`numpy`, `pandas`,
205
+ `scipy`, `pyarrow`); optional extras land per analytical layer so you
206
+ only pay for what you use.
189
207
 
190
- Currently a pre-release alpha. From a local clone:
208
+ To work from source:
191
209
 
192
210
  ```bash
193
211
  git clone https://github.com/jturner-uofl/pycorpdiff
194
212
  cd pycorpdiff
195
213
  pip install -e ".[dev]"
196
- pytest -q # 519 default tests, ~7s
214
+ pytest -q
197
215
  ```
198
216
 
199
- Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
200
- (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
201
- statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
202
- `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
203
- or `[all]`.
204
-
205
217
  ## Cross-validation receipts
206
218
 
207
219
  The math agrees with the standard tools — by automated test:
@@ -1,14 +1,9 @@
1
1
  # pycorpdiff
2
2
 
3
- <!--
4
- TODO post-publish (Phase 5 — once GitHub repo public + PyPI published + Zenodo DOI minted):
5
-
6
3
  [![PyPI](https://img.shields.io/pypi/v/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
7
4
  [![Python versions](https://img.shields.io/pypi/pyversions/pycorpdiff.svg)](https://pypi.org/project/pycorpdiff/)
8
5
  [![CI](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml/badge.svg)](https://github.com/jturner-uofl/pycorpdiff/actions/workflows/ci.yml)
9
- [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.<RECORD>.svg)](https://doi.org/10.5281/zenodo.<RECORD>)
10
6
  [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
11
- -->
12
7
 
13
8
  **Comparative corpus analysis for modern Python workflows.**
14
9
 
@@ -40,8 +35,8 @@ points — one-line adapters, no plugin registry. The base install pulls
40
35
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
41
36
  via extras.
42
37
 
43
- > **Status: pre-release alpha (0.1.0a0).** Public API is stable for the
44
- > features described below; PyPI publication is the next milestone.
38
+ > **Status: alpha (0.1.0a1).** Public API is stable for the features
39
+ > described below; on PyPI as `pip install pycorpdiff`.
45
40
 
46
41
  ## The three-layer architecture
47
42
 
@@ -56,33 +51,45 @@ via extras.
56
51
  ```python
57
52
  import pycorpdiff as pcd
58
53
 
59
- news = pcd.from_dataframe(df, text_col="body", meta_cols=("outlet", "date"))
54
+ # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
55
+ corpus = pcd.load_hansard_sample()
56
+ immigration = corpus.slice(topic="immigration")
57
+ human = immigration.slice(frame="humanising")
58
+ criminal = immigration.slice(frame="criminalising")
60
59
 
61
60
  # Compare — three verbs
62
- k = pcd.compare(news.slice(outlet="Guardian"), news.slice(outlet="Mail")).keyness()
63
- c = pcd.compare(a, b).collocation_shift("migrant")
64
- s = pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder())
61
+ k = pcd.compare(human, criminal).keyness()
62
+ c = pcd.compare(human, criminal).collocation_shift("immigrant")
63
+ # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
64
+ # ↑ requires `pip install "pycorpdiff[semantic]"`
65
65
 
66
66
  # Track over time
67
- tr = pcd.track(news, "migrant").over_time(freq="Y")
68
- tr.changepoints() # offline PELT
69
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
70
- tr.interrupted_time_series(event_date="2016-06-23") # segmented OLS
71
- tr.causal_impact(event_date="2016-06-23") # Bayesian counterfactual (Brodersen 2015)
72
- tr.forecast(horizon=4) # state-space ETS
67
+ tr = pcd.track(immigration, "criminal").over_time(freq="Y")
68
+ tr.changepoints() # offline PELT
69
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
70
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
71
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
72
+ tr.forecast(horizon=4) # state-space ETS
73
73
 
74
74
  # Before / after a known event
75
- pcd.compare.before_after(news, event_date="2016-06-23").keyness()
75
+ pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
76
76
 
77
- # N-way (≥ 2 corpora)
78
- pcd.keyness_multi([gu, ma, te, mi], labels=["Guardian", "Mail", "Telegraph", "Mirror"])
77
+ # N-way (≥ 2 corpora) — one keyness across all four parties
78
+ parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
79
+ nhs = corpus.slice(topic="nhs")
80
+ pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
79
81
 
80
82
  # The discourse as a graph
81
- pcd.cooccurrence_network(news, top_n=50).plot()
83
+ pcd.cooccurrence_network(immigration, top_n=30).plot()
82
84
 
83
85
  # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
84
86
  ```
85
87
 
88
+ The snippet above runs as-is on a fresh `pip install pycorpdiff` — no data
89
+ download required. Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
90
+ `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or `pcd.from_huggingface(...)`
91
+ to use your own corpus.
92
+
86
93
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
87
94
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
88
95
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -90,23 +97,28 @@ surface.
90
97
 
91
98
  ## Installation
92
99
 
93
- <!-- TODO post-publish: replace this block with the PyPI install commands once published. -->
100
+ ```bash
101
+ pip install pycorpdiff # lexical-comparative core
102
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
103
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
104
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
105
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
106
+ pip install "pycorpdiff[all]" # everything
107
+ ```
108
+
109
+ The base install keeps a small dependency footprint (`numpy`, `pandas`,
110
+ `scipy`, `pyarrow`); optional extras land per analytical layer so you
111
+ only pay for what you use.
94
112
 
95
- Currently a pre-release alpha. From a local clone:
113
+ To work from source:
96
114
 
97
115
  ```bash
98
116
  git clone https://github.com/jturner-uofl/pycorpdiff
99
117
  cd pycorpdiff
100
118
  pip install -e ".[dev]"
101
- pytest -q # 519 default tests, ~7s
119
+ pytest -q
102
120
  ```
103
121
 
104
- Optional extras: `[viz]` (altair + matplotlib + networkx), `[semantic]`
105
- (sentence-transformers + scikit-learn), `[temporal]` (ruptures +
106
- statsmodels), `[polars]`, `[duckdb]`, `[huggingface]`, `[nlp]` (spaCy),
107
- `[notebooks]` (jupyter + vl-convert + pysofra, for the showcase),
108
- or `[all]`.
109
-
110
122
  ## Cross-validation receipts
111
123
 
112
124
  The math agrees with the standard tools — by automated test:
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a0"
7
+ version = "0.1.0a1"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -23,7 +23,7 @@ keywords = [
23
23
  "temporal text analysis",
24
24
  ]
25
25
  classifiers = [
26
- "Development Status :: 2 - Pre-Alpha",
26
+ "Development Status :: 3 - Alpha",
27
27
  "Intended Audience :: Science/Research",
28
28
  "License :: OSI Approved :: MIT License",
29
29
  "Programming Language :: Python :: 3",
@@ -36,9 +36,9 @@ classifiers = [
36
36
  ]
37
37
  dependencies = [
38
38
  "numpy>=1.24",
39
- # Capped at <3 to match the constraint pysofra discovered: pandas 3.x
40
- # raises Pandas4Warning under strict warning filters via third-party
41
- # DataFrame copies. Lift when the wider PyData stack catches up.
39
+ # Capped at <3: pandas 3.x raises Pandas4Warning under strict
40
+ # warning filters via third-party DataFrame copies. Lift when the
41
+ # wider PyData stack catches up.
42
42
  "pandas>=2.0,<3",
43
43
  "scipy>=1.11",
44
44
  "pyarrow>=14",
@@ -47,13 +47,12 @@ dependencies = [
47
47
  [project.optional-dependencies]
48
48
  # Visualisation: altair-first, matplotlib retained for paper-grade figures.
49
49
  viz = ["altair>=5", "matplotlib>=3.8", "networkx>=3.1"]
50
- # Embedding-based semantic shift (Phase 2). sentence-transformers pulls
51
- # torch transitively, which is why this is opt-in rather than a base dep.
50
+ # Embedding-based semantic shift. sentence-transformers pulls torch
51
+ # transitively, which is why this is opt-in rather than a base dep.
52
52
  semantic = ["sentence-transformers>=2.2", "scikit-learn>=1.3"]
53
53
  # Changepoint detection + interrupted time series.
54
54
  temporal = ["ruptures>=1.1", "statsmodels>=0.14"]
55
- # Optional columnar backend. polars.from_pandas() needs pyarrow at runtime
56
- # (mirrors pysofra's pinning).
55
+ # Optional columnar backend. polars.from_pandas() needs pyarrow at runtime.
57
56
  polars = ["polars>=1.0", "pyarrow>=15"]
58
57
  # Out-of-core querying for large corpora.
59
58
  duckdb = ["duckdb>=0.10"]
@@ -92,9 +91,9 @@ dev = [
92
91
  "ruff>=0.4",
93
92
  "mypy>=1.8",
94
93
  "pre-commit>=3.6",
95
- # pandas-stubs sharpens mypy strict typing for pandas surfaces (per
96
- # pysofra). Without it, ignore_missing_imports would mask real
97
- # typing regressions in pandas-mediated code paths.
94
+ # pandas-stubs sharpens mypy strict typing for pandas surfaces.
95
+ # Without it, ignore_missing_imports would mask real typing
96
+ # regressions in pandas-mediated code paths.
98
97
  "pandas-stubs>=2.2",
99
98
  ]
100
99
  [project.urls]
@@ -119,7 +118,6 @@ include = [
119
118
  exclude = [
120
119
  "docs",
121
120
  "examples",
122
- "site",
123
121
  ".github",
124
122
  "uv.lock",
125
123
  ]
@@ -1,26 +1,25 @@
1
1
  """pycorpdiff — comparative corpus analysis for modern Python workflows.
2
2
 
3
3
  The package exposes three public verbs (:func:`compare`, :func:`track`,
4
- plus the :class:`Corpus` constructor and the I/O ``read_*`` helpers) and
5
- four families of result objects (:class:`KeynessResult`,
6
- :class:`CollocationShiftResult`, :class:`SemanticShiftResult`,
7
- :class:`TemporalTrajectory`).
8
-
9
- Layer-1 ingestion utilities are functional in this scaffolding release;
10
- Layer-2 analytical methods raise :class:`NotImplementedError` until Phase 1
11
- of the roadmap lands.
4
+ :func:`compare.before_after`) and a family of frozen-dataclass
5
+ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
6
+ :class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
7
+ :class:`NetworkResult`, :class:`ForecastResult`,
8
+ :class:`CausalImpactResult`, :class:`BocpdResult`,
9
+ :class:`ConcordanceResult`), each implementing the same
10
+ ``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
12
11
 
13
12
  Example
14
13
  -------
15
14
 
16
15
  >>> import pycorpdiff as pcd
17
16
  >>> pcd.__version__
18
- '0.1.0a0'
17
+ '0.1.0a1'
19
18
  """
20
19
 
21
20
  from __future__ import annotations
22
21
 
23
- __version__ = "0.1.0a0"
22
+ __version__ = "0.1.0a1"
24
23
 
25
24
  from .collocation.network import NetworkResult, cooccurrence_network
26
25
  from .compare import Comparison, compare
@@ -251,7 +251,16 @@ class SemanticShiftResult:
251
251
  return _table_to_json(self.table, path, **kw)
252
252
 
253
253
  def plot(self, **kw: Any) -> alt.Chart:
254
- raise NotImplementedError("SemanticShiftResult.plot() lands in Phase 6")
254
+ """Plotting for SemanticShiftResult is not yet implemented.
255
+
256
+ For a forward-looking trajectory of cosine distances, use
257
+ :func:`pycorpdiff.semantic_trajectory` and pass the resulting
258
+ DataFrame to :func:`pycorpdiff.viz.semantic_forecast_plot`.
259
+ """
260
+ raise NotImplementedError(
261
+ "SemanticShiftResult.plot() is not yet implemented; "
262
+ "use .table or pcd.viz.semantic_forecast_plot() instead"
263
+ )
255
264
 
256
265
  def neighbors_before(
257
266
  self, target: str | None = None, n: int = 10
@@ -1,10 +1,8 @@
1
- """Smoke tests for the Phase 0 scaffolding.
1
+ """Smoke tests for the public surface.
2
2
 
3
- These exercise the parts of the package that are real in the scaffolding
4
- release: imports, the Corpus constructor, slicing, the regex tokenizer,
5
- and the CSV/parquet readers. Analytical methods are expected to raise
6
- NotImplementedError and are intentionally not exercised here — Phase 1
7
- will replace those tests with real ones.
3
+ Quick exercises of imports, the :class:`Corpus` constructor, slicing,
4
+ the default regex tokenizer, and the CSV/parquet readers. The deeper
5
+ analytical surfaces have their own dedicated test modules.
8
6
  """
9
7
 
10
8
  from __future__ import annotations
File without changes
File without changes