pycorpdiff 0.1.0a2__tar.gz → 0.1.0a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (128) hide show
  1. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/CHANGELOG.md +1 -1
  2. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/CITATION.cff +1 -1
  3. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/PKG-INFO +44 -35
  4. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/README.md +41 -32
  5. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/pyproject.toml +3 -3
  6. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/__init__.py +2 -2
  7. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/.gitignore +0 -0
  8. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/LICENSE +0 -0
  9. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/__init__.py +0 -0
  10. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/pandas.py +0 -0
  11. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/_backends/polars.py +0 -0
  12. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/__init__.py +0 -0
  13. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  14. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/measures.py +0 -0
  15. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/network.py +0 -0
  16. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/collocation/shift.py +0 -0
  17. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/compare.py +0 -0
  18. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/corpus.py +0 -0
  19. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/__init__.py +0 -0
  20. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  21. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  22. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/hansard.py +0 -0
  23. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/datasets/histwords.py +0 -0
  24. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/explain.py +0 -0
  25. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/__init__.py +0 -0
  26. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/duckdb.py +0 -0
  27. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/huggingface.py +0 -0
  28. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/io/readers.py +0 -0
  29. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/__init__.py +0 -0
  30. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/bayes.py +0 -0
  31. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  32. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/correction.py +0 -0
  33. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/dispersion.py +0 -0
  34. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  35. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/loglikelihood.py +0 -0
  36. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  37. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/keyness/permutation.py +0 -0
  38. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/py.typed +0 -0
  39. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/results.py +0 -0
  40. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/__init__.py +0 -0
  41. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/alignment.py +0 -0
  42. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/embed.py +0 -0
  43. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/shift.py +0 -0
  44. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/semantic/trajectory.py +0 -0
  45. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/stats.py +0 -0
  46. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/__init__.py +0 -0
  47. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/bocpd.py +0 -0
  48. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  49. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/changepoint.py +0 -0
  50. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/forecast.py +0 -0
  51. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/its.py +0 -0
  52. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/temporal/slicing.py +0 -0
  53. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/tokenize.py +0 -0
  54. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/__init__.py +0 -0
  55. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/bocpd.py +0 -0
  56. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/causal_impact.py +0 -0
  57. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/collocation.py +0 -0
  58. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/dispersion.py +0 -0
  59. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/forecast.py +0 -0
  60. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/keyness.py +0 -0
  61. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/network.py +0 -0
  62. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/scattertext.py +0 -0
  63. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  64. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/src/pycorpdiff/viz/trajectory.py +0 -0
  65. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/__init__.py +0 -0
  66. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/conftest.py +0 -0
  67. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/fixtures/__init__.py +0 -0
  68. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/__init__.py +0 -0
  69. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_collocation_integration.py +0 -0
  70. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_histwords.py +0 -0
  71. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_nltk.py +0 -0
  72. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_quanteda.py +0 -0
  73. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_rayson.py +0 -0
  74. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_crossval_scattertext.py +0 -0
  75. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_explain_integration.py +0 -0
  76. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_keyness_integration.py +0 -0
  77. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_sbert_slow.py +0 -0
  78. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_semantic_integration.py +0 -0
  79. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_stop_words.py +0 -0
  80. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_temporal_stats.py +0 -0
  81. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/integration/test_viz.py +0 -0
  82. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/__init__.py +0 -0
  83. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_collocation_properties.py +0 -0
  84. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_keyness_properties.py +0 -0
  85. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/property/test_temporal_properties.py +0 -0
  86. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/__init__.py +0 -0
  87. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_bayes_factor.py +0 -0
  88. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_bocpd.py +0 -0
  89. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_causal_impact.py +0 -0
  90. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_changepoint.py +0 -0
  91. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_chi_squared.py +0 -0
  92. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_cooccurrence.py +0 -0
  93. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_measures.py +0 -0
  94. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_collocation_shift.py +0 -0
  95. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_comparison_concordance.py +0 -0
  96. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_cooccurrence_network.py +0 -0
  97. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_hash.py +0 -0
  98. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_corpus_vocab.py +0 -0
  99. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_correction.py +0 -0
  100. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_datasets_hansard.py +0 -0
  101. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion.py +0 -0
  102. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_dispersion_plot.py +0 -0
  103. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  104. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_effect_sizes.py +0 -0
  105. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_embedders.py +0 -0
  106. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_explain.py +0 -0
  107. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast.py +0 -0
  108. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_forecast_semantic_drift.py +0 -0
  109. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_from_huggingface.py +0 -0
  110. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_hansard_fetcher.py +0 -0
  111. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_histwords_loader.py +0 -0
  112. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_its.py +0 -0
  113. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_keyness_multi.py +0 -0
  114. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_loglikelihood.py +0 -0
  115. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_ngram_tokenizer.py +0 -0
  116. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_permutation_keyness.py +0 -0
  117. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_polars_interop.py +0 -0
  118. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_procrustes.py +0 -0
  119. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_read_duckdb.py +0 -0
  120. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_read_txt_line_mode.py +0 -0
  121. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_result_exports.py +0 -0
  122. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_scattertext_plot.py +0 -0
  123. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_neighbours.py +0 -0
  124. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_shift.py +0 -0
  125. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_semantic_trajectory.py +0 -0
  126. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_smoke.py +0 -0
  127. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_temporal.py +0 -0
  128. {pycorpdiff-0.1.0a2 → pycorpdiff-0.1.0a3}/tests/unit/test_wilson_ci.py +0 -0
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
4
4
  follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
5
  project adheres to [Semantic Versioning](https://semver.org/).
6
6
 
7
- ## [0.1.0a2] — initial release
7
+ ## [0.1.0a3] — initial release
8
8
 
9
9
  The initial public release of `pycorpdiff` — comparative corpus analysis
10
10
  for modern Python workflows. Three public verbs (`compare`, `track`,
@@ -4,7 +4,7 @@ message: >
4
4
  entry. GitHub renders a "Cite this repository" widget directly from
5
5
  this file.
6
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
7
- version: 0.1.0a2
7
+ version: 0.1.0a3
8
8
  date-released: 2026-05-22
9
9
  authors:
10
10
  - family-names: Turner
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a2
3
+ Version: 0.1.0a3
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -53,7 +53,7 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
53
53
  Requires-Dist: networkx>=3.1; extra == 'all'
54
54
  Requires-Dist: polars>=1.0; extra == 'all'
55
55
  Requires-Dist: pyarrow>=15; extra == 'all'
56
- Requires-Dist: pysofra>=0.1.0a2; extra == 'all'
56
+ Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
57
57
  Requires-Dist: ruptures>=1.1; extra == 'all'
58
58
  Requires-Dist: scikit-learn>=1.3; extra == 'all'
59
59
  Requires-Dist: sentence-transformers>=2.2; extra == 'all'
@@ -76,7 +76,7 @@ Provides-Extra: nlp
76
76
  Requires-Dist: spacy>=3.7; extra == 'nlp'
77
77
  Provides-Extra: notebooks
78
78
  Requires-Dist: jupyter>=1.0; extra == 'notebooks'
79
- Requires-Dist: pysofra>=0.1.0a2; extra == 'notebooks'
79
+ Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
80
80
  Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
81
81
  Provides-Extra: polars
82
82
  Requires-Dist: polars>=1.0; extra == 'polars'
@@ -130,7 +130,7 @@ points — one-line adapters, no plugin registry. The base install pulls
130
130
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
131
131
  via extras.
132
132
 
133
- > **Status: alpha (0.1.0a2).** Public API is stable for the features
133
+ > **Status: alpha (0.1.0a3).** Public API is stable for the features
134
134
  > described below; on PyPI as `pip install pycorpdiff`.
135
135
 
136
136
  ## The three-layer architecture
@@ -144,52 +144,61 @@ via extras.
144
144
  ## Quick start
145
145
 
146
146
  ```bash
147
- pip install "pycorpdiff[viz,temporal]"
147
+ pip install "pycorpdiff[viz]"
148
148
  ```
149
149
 
150
150
  ```python
151
151
  import pycorpdiff as pcd
152
152
 
153
- # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
153
+ # Bundled UK-Hansard sample — runs offline, no data download.
154
154
  corpus = pcd.load_hansard_sample()
155
155
  immigration = corpus.slice(topic="immigration")
156
- human = immigration.slice(frame="humanising")
157
- criminal = immigration.slice(frame="criminalising")
158
-
159
- # Compare — three verbs
160
- k = pcd.compare(human, criminal).keyness()
161
- c = pcd.compare(human, criminal).collocation_shift("immigrant")
162
- # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
163
- # requires `pip install "pycorpdiff[semantic]"`
164
-
165
- # Track over time
166
- tr = pcd.track(immigration, "criminal").over_time(freq="Y")
167
- tr.changepoints() # offline PELT
168
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
169
- tr.interrupted_time_series(event_date="2016") # segmented OLS
170
- tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
171
- tr.forecast(horizon=4) # state-space ETS
156
+
157
+ # Which words separate the humanising and criminalising frames?
158
+ keyness = pcd.compare(
159
+ immigration.slice(frame="humanising"),
160
+ immigration.slice(frame="criminalising"),
161
+ ).keyness(min_count=3)
162
+
163
+ keyness.plot() # volcano plot — picture the result
164
+ # keyness.table.head(10) # or look at the ranked table directly
165
+ # keyness.explain("criminal") # KWIC concordances showing the textual evidence
166
+ ```
167
+
168
+ That's the entire surface in five lines: load a corpus, slice it,
169
+ compare two slices, plot the result. Every other analytical method —
170
+ collocation shifts, semantic drift, temporal trajectories, changepoint
171
+ detection, causal-impact analysis, forecasting, co-occurrence networks,
172
+ N-way keyness — follows the same shape. See
173
+ [the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
174
+ full feature tour, or the cheat sheet below for one-line API previews.
175
+
176
+ ### Cheat sheet — every analytical surface in one block
177
+
178
+ ```python
179
+ # Compare verbs (each returns a Result object with .plot / .to_df / .explain / .summary)
180
+ pcd.compare(a, b).keyness()
181
+ pcd.compare(a, b).collocation_shift("migrant")
182
+ pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
183
+
184
+ # Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
185
+ tr = pcd.track(corpus, "migrant").over_time(freq="Y")
186
+ tr.changepoints() # offline PELT
187
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
188
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
189
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
190
+ tr.forecast(horizon=4) # state-space ETS
172
191
 
173
192
  # Before / after a known event
174
193
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
175
194
 
176
- # N-way (≥ 2 corpora) — one keyness across all four parties
177
- parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
178
- nhs = corpus.slice(topic="nhs")
179
- pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
195
+ # N-way (≥ 2 corpora)
196
+ pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
180
197
 
181
198
  # The discourse as a graph
182
- pcd.cooccurrence_network(immigration, top_n=30).plot()
183
-
184
- # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
199
+ pcd.cooccurrence_network(corpus, top_n=30).plot()
185
200
  ```
186
201
 
187
- Every line of the snippet above is verified end-to-end against
188
- `pip install "pycorpdiff[viz,temporal]"` — no data download required.
189
- Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
190
- `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
191
- `pcd.from_huggingface(...)` to use your own corpus.
192
-
193
202
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
194
203
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
195
204
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
35
35
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
36
36
  via extras.
37
37
 
38
- > **Status: alpha (0.1.0a2).** Public API is stable for the features
38
+ > **Status: alpha (0.1.0a3).** Public API is stable for the features
39
39
  > described below; on PyPI as `pip install pycorpdiff`.
40
40
 
41
41
  ## The three-layer architecture
@@ -49,52 +49,61 @@ via extras.
49
49
  ## Quick start
50
50
 
51
51
  ```bash
52
- pip install "pycorpdiff[viz,temporal]"
52
+ pip install "pycorpdiff[viz]"
53
53
  ```
54
54
 
55
55
  ```python
56
56
  import pycorpdiff as pcd
57
57
 
58
- # Bundled synthetic UK-Hansard corpus — runs offline, no data needed.
58
+ # Bundled UK-Hansard sample — runs offline, no data download.
59
59
  corpus = pcd.load_hansard_sample()
60
60
  immigration = corpus.slice(topic="immigration")
61
- human = immigration.slice(frame="humanising")
62
- criminal = immigration.slice(frame="criminalising")
63
-
64
- # Compare — three verbs
65
- k = pcd.compare(human, criminal).keyness()
66
- c = pcd.compare(human, criminal).collocation_shift("immigrant")
67
- # s = pcd.compare(human, criminal).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder())
68
- # requires `pip install "pycorpdiff[semantic]"`
69
-
70
- # Track over time
71
- tr = pcd.track(immigration, "criminal").over_time(freq="Y")
72
- tr.changepoints() # offline PELT
73
- tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
74
- tr.interrupted_time_series(event_date="2016") # segmented OLS
75
- tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
76
- tr.forecast(horizon=4) # state-space ETS
61
+
62
+ # Which words separate the humanising and criminalising frames?
63
+ keyness = pcd.compare(
64
+ immigration.slice(frame="humanising"),
65
+ immigration.slice(frame="criminalising"),
66
+ ).keyness(min_count=3)
67
+
68
+ keyness.plot() # volcano plot — picture the result
69
+ # keyness.table.head(10) # or look at the ranked table directly
70
+ # keyness.explain("criminal") # KWIC concordances showing the textual evidence
71
+ ```
72
+
73
+ That's the entire surface in five lines: load a corpus, slice it,
74
+ compare two slices, plot the result. Every other analytical method —
75
+ collocation shifts, semantic drift, temporal trajectories, changepoint
76
+ detection, causal-impact analysis, forecasting, co-occurrence networks,
77
+ N-way keyness — follows the same shape. See
78
+ [the showcase notebook](docs/rendered/pycorpdiff_showcase.html) for the
79
+ full feature tour, or the cheat sheet below for one-line API previews.
80
+
81
+ ### Cheat sheet — every analytical surface in one block
82
+
83
+ ```python
84
+ # Compare verbs (each returns a Result object with .plot / .to_df / .explain / .summary)
85
+ pcd.compare(a, b).keyness()
86
+ pcd.compare(a, b).collocation_shift("migrant")
87
+ pcd.compare(a, b).semantic_shift("migrant", embedder=pcd.SBERTEmbedder()) # [semantic]
88
+
89
+ # Track over time (requires [temporal] for the changepoint + ITS + forecast + causal_impact methods)
90
+ tr = pcd.track(corpus, "migrant").over_time(freq="Y")
91
+ tr.changepoints() # offline PELT
92
+ tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
93
+ tr.interrupted_time_series(event_date="2016") # segmented OLS
94
+ tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
95
+ tr.forecast(horizon=4) # state-space ETS
77
96
 
78
97
  # Before / after a known event
79
98
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
80
99
 
81
- # N-way (≥ 2 corpora) — one keyness across all four parties
82
- parties = ["Conservative", "Labour", "Liberal Democrat", "SNP"]
83
- nhs = corpus.slice(topic="nhs")
84
- pcd.keyness_multi([nhs.slice(party=p) for p in parties], labels=parties)
100
+ # N-way (≥ 2 corpora)
101
+ pcd.keyness_multi([a, b, c, d], labels=["A", "B", "C", "D"])
85
102
 
86
103
  # The discourse as a graph
87
- pcd.cooccurrence_network(immigration, top_n=30).plot()
88
-
89
- # Every Result: .to_df() · .plot() · .explain() · .summary() · .to_html() · .to_json()
104
+ pcd.cooccurrence_network(corpus, top_n=30).plot()
90
105
  ```
91
106
 
92
- Every line of the snippet above is verified end-to-end against
93
- `pip install "pycorpdiff[viz,temporal]"` — no data download required.
94
- Replace `load_hansard_sample()` with `pcd.from_dataframe(your_df, ...)`,
95
- `pcd.read_parquet(...)`, `pcd.fetch_hansard(...)`, or
96
- `pcd.from_huggingface(...)` to use your own corpus.
97
-
98
107
  See [`examples/pycorpdiff_showcase.ipynb`](examples/pycorpdiff_showcase.ipynb)
99
108
  ([rendered HTML](docs/rendered/pycorpdiff_showcase.html)) for a
100
109
  walkthrough on a synthetic UK Hansard corpus exercising every analytical
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a2"
7
+ version = "0.1.0a3"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -66,7 +66,7 @@ huggingface = ["datasets>=2.14"]
66
66
  # rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
67
67
  # static SVG/PNG export of altair charts, `pysofra` renders the showcase's
68
68
  # result tables in JAMA-style typography.
69
- notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a2"]
69
+ notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
70
70
  # Meta-extra so `pycorpdiff[all]` exercises every optional code path.
71
71
  all = [
72
72
  "altair>=5",
@@ -82,7 +82,7 @@ all = [
82
82
  "duckdb>=0.10",
83
83
  "spacy>=3.7",
84
84
  "vl-convert-python>=1.5",
85
- "pysofra>=0.1.0a2",
85
+ "pysofra>=0.1.0a3",
86
86
  ]
87
87
  dev = [
88
88
  "pytest>=8",
@@ -14,12 +14,12 @@ Example
14
14
 
15
15
  >>> import pycorpdiff as pcd
16
16
  >>> pcd.__version__
17
- '0.1.0a2'
17
+ '0.1.0a3'
18
18
  """
19
19
 
20
20
  from __future__ import annotations
21
21
 
22
- __version__ = "0.1.0a2"
22
+ __version__ = "0.1.0a3"
23
23
 
24
24
  from .collocation.network import NetworkResult, cooccurrence_network
25
25
  from .compare import Comparison, compare
File without changes
File without changes