pycorpdiff 0.1.0a5__tar.gz → 0.1.0a6__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/CHANGELOG.md +1 -1
  2. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/CITATION.cff +4 -2
  3. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/PKG-INFO +3 -3
  4. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/README.md +2 -2
  5. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/pyproject.toml +3 -1
  6. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/__init__.py +2 -2
  7. pycorpdiff-0.1.0a6/src/pycorpdiff/_backends/pandas.py +9 -0
  8. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/compare.py +12 -1
  9. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/io/huggingface.py +1 -1
  10. pycorpdiff-0.1.0a6/src/pycorpdiff/keyness/loglikelihood.py +149 -0
  11. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/results.py +13 -7
  12. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_crossval_quanteda.py +15 -4
  13. pycorpdiff-0.1.0a5/src/pycorpdiff/_backends/pandas.py +0 -3
  14. pycorpdiff-0.1.0a5/src/pycorpdiff/keyness/loglikelihood.py +0 -92
  15. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/.gitignore +0 -0
  16. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/LICENSE +0 -0
  17. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/_backends/__init__.py +0 -0
  18. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/_backends/polars.py +0 -0
  19. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/collocation/__init__.py +0 -0
  20. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  21. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/collocation/measures.py +0 -0
  22. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/collocation/network.py +0 -0
  23. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/collocation/shift.py +0 -0
  24. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/corpus.py +0 -0
  25. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/datasets/__init__.py +0 -0
  26. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  27. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  28. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/datasets/hansard.py +0 -0
  29. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/datasets/histwords.py +0 -0
  30. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/explain.py +0 -0
  31. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/io/__init__.py +0 -0
  32. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/io/duckdb.py +0 -0
  33. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/io/readers.py +0 -0
  34. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/__init__.py +0 -0
  35. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/bayes.py +0 -0
  36. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  37. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/correction.py +0 -0
  38. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/dispersion.py +0 -0
  39. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  40. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  41. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/keyness/permutation.py +0 -0
  42. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/py.typed +0 -0
  43. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/semantic/__init__.py +0 -0
  44. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/semantic/alignment.py +0 -0
  45. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/semantic/embed.py +0 -0
  46. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/semantic/shift.py +0 -0
  47. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/semantic/trajectory.py +0 -0
  48. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/stats.py +0 -0
  49. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/__init__.py +0 -0
  50. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/bocpd.py +0 -0
  51. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  52. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/changepoint.py +0 -0
  53. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/forecast.py +0 -0
  54. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/its.py +0 -0
  55. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/temporal/slicing.py +0 -0
  56. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/tokenize.py +0 -0
  57. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/__init__.py +0 -0
  58. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/bocpd.py +0 -0
  59. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/causal_impact.py +0 -0
  60. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/collocation.py +0 -0
  61. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/dispersion.py +0 -0
  62. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/forecast.py +0 -0
  63. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/keyness.py +0 -0
  64. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/network.py +0 -0
  65. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/scattertext.py +0 -0
  66. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  67. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/src/pycorpdiff/viz/trajectory.py +0 -0
  68. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/__init__.py +0 -0
  69. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/conftest.py +0 -0
  70. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/fixtures/__init__.py +0 -0
  71. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/__init__.py +0 -0
  72. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_collocation_integration.py +0 -0
  73. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_crossval_histwords.py +0 -0
  74. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_crossval_nltk.py +0 -0
  75. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_crossval_rayson.py +0 -0
  76. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_crossval_scattertext.py +0 -0
  77. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_explain_integration.py +0 -0
  78. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_keyness_integration.py +0 -0
  79. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_sbert_slow.py +0 -0
  80. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_semantic_integration.py +0 -0
  81. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_stop_words.py +0 -0
  82. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_temporal_stats.py +0 -0
  83. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/integration/test_viz.py +0 -0
  84. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/property/__init__.py +0 -0
  85. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/property/test_collocation_properties.py +0 -0
  86. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/property/test_keyness_properties.py +0 -0
  87. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/property/test_temporal_properties.py +0 -0
  88. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/__init__.py +0 -0
  89. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_bayes_factor.py +0 -0
  90. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_bocpd.py +0 -0
  91. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_causal_impact.py +0 -0
  92. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_changepoint.py +0 -0
  93. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_chi_squared.py +0 -0
  94. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_collocation_cooccurrence.py +0 -0
  95. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_collocation_measures.py +0 -0
  96. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_collocation_shift.py +0 -0
  97. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_comparison_concordance.py +0 -0
  98. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_cooccurrence_network.py +0 -0
  99. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_corpus_hash.py +0 -0
  100. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_corpus_vocab.py +0 -0
  101. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_correction.py +0 -0
  102. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_datasets_hansard.py +0 -0
  103. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_dispersion.py +0 -0
  104. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_dispersion_plot.py +0 -0
  105. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  106. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_effect_sizes.py +0 -0
  107. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_embedders.py +0 -0
  108. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_explain.py +0 -0
  109. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_forecast.py +0 -0
  110. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_forecast_semantic_drift.py +0 -0
  111. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_from_huggingface.py +0 -0
  112. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_hansard_fetcher.py +0 -0
  113. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_histwords_loader.py +0 -0
  114. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_its.py +0 -0
  115. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_keyness_multi.py +0 -0
  116. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_loglikelihood.py +0 -0
  117. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_ngram_tokenizer.py +0 -0
  118. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_permutation_keyness.py +0 -0
  119. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_polars_interop.py +0 -0
  120. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_procrustes.py +0 -0
  121. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_read_duckdb.py +0 -0
  122. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_read_txt_line_mode.py +0 -0
  123. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_result_exports.py +0 -0
  124. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_scattertext_plot.py +0 -0
  125. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_semantic_neighbours.py +0 -0
  126. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_semantic_shift.py +0 -0
  127. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_semantic_trajectory.py +0 -0
  128. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_smoke.py +0 -0
  129. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_temporal.py +0 -0
  130. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a6}/tests/unit/test_wilson_ci.py +0 -0
@@ -4,7 +4,7 @@ All notable changes to `pycorpdiff` are documented in this file. The format
4
4
  follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
5
  project adheres to [Semantic Versioning](https://semver.org/).
6
6
 
7
- ## [0.1.0a5] — initial release
7
+ ## [0.1.0a6] — initial release
8
8
 
9
9
  The initial public release of `pycorpdiff` — comparative corpus analysis
10
10
  for modern Python workflows. Three public verbs (`compare`, `track`,
@@ -4,7 +4,7 @@ message: >
4
4
  entry. GitHub renders a "Cite this repository" widget directly from
5
5
  this file.
6
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
7
- version: 0.1.0a5
7
+ version: 0.1.0a6
8
8
  date-released: 2026-05-25
9
9
  authors:
10
10
  - family-names: Turner
@@ -32,7 +32,9 @@ abstract: >
32
32
  API. The package targets corpus linguistics, digital humanities,
33
33
  computational social science, and discourse analysis research,
34
34
  emphasising interpretability, explainability, statistical rigour,
35
- and reproducibility.
35
+ and reproducibility. A bundled synthetic UK-Hansard-style sample
36
+ ships for offline demonstration; real-data interfaces include
37
+ fetch_hansard and from_huggingface.
36
38
  identifiers:
37
39
  - type: url
38
40
  value: "https://github.com/jturner-uofl/pycorpdiff"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a5
3
+ Version: 0.1.0a6
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -131,7 +131,7 @@ points — one-line adapters, no plugin registry. The base install pulls
131
131
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
132
132
  via extras.
133
133
 
134
- > **Status: alpha (0.1.0a5).** Public API is stable for the features
134
+ > **Status: alpha (0.1.0a6).** Public API is stable for the features
135
135
  > described below; on PyPI as `pip install pycorpdiff`.
136
136
 
137
137
  ## The three-layer architecture
@@ -237,7 +237,7 @@ The math agrees with the standard tools — by automated test:
237
237
  - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
238
238
  - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
239
239
  - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
240
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
240
+ - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement with `formula="dunning"` (slow tier)
241
241
  - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
242
242
 
243
243
  ## Citation
@@ -35,7 +35,7 @@ points — one-line adapters, no plugin registry. The base install pulls
35
35
  only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
36
36
  via extras.
37
37
 
38
- > **Status: alpha (0.1.0a5).** Public API is stable for the features
38
+ > **Status: alpha (0.1.0a6).** Public API is stable for the features
39
39
  > described below; on PyPI as `pip install pycorpdiff`.
40
40
 
41
41
  ## The three-layer architecture
@@ -141,7 +141,7 @@ The math agrees with the standard tools — by automated test:
141
141
  - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
142
142
  - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
143
143
  - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
144
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
144
+ - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement with `formula="dunning"` (slow tier)
145
145
  - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
146
146
 
147
147
  ## Citation
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a5"
7
+ version = "0.1.0a6"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -176,6 +176,8 @@ disallow_any_generics = true
176
176
  module = [
177
177
  "altair",
178
178
  "altair.*",
179
+ "datasets",
180
+ "datasets.*",
179
181
  "duckdb",
180
182
  "duckdb.*",
181
183
  "matplotlib",
@@ -14,12 +14,12 @@ Example
14
14
 
15
15
  >>> import pycorpdiff as pcd
16
16
  >>> pcd.__version__
17
- '0.1.0a5'
17
+ '0.1.0a6'
18
18
  """
19
19
 
20
20
  from __future__ import annotations
21
21
 
22
- __version__ = "0.1.0a5"
22
+ __version__ = "0.1.0a6"
23
23
 
24
24
  from .collocation.network import NetworkResult, cooccurrence_network
25
25
  from .compare import Comparison, compare
@@ -0,0 +1,9 @@
1
+ """Pandas-backed internals for :class:`pycorpdiff.Corpus`.
2
+
3
+ Corpus operations route through this module so backend-specific code
4
+ stays out of the public API. The pandas backend is the default and is
5
+ exercised on every install; polars is opt-in via the ``polars`` extra
6
+ and lives in the sibling :mod:`pycorpdiff._backends.polars`.
7
+ """
8
+
9
+ from __future__ import annotations
@@ -10,6 +10,7 @@ from dataclasses import dataclass
10
10
  from typing import TYPE_CHECKING, Literal
11
11
 
12
12
  from .corpus import Corpus, CorpusSlice
13
+ from .keyness.loglikelihood import LLFormula
13
14
 
14
15
  if TYPE_CHECKING:
15
16
  from .results import (
@@ -46,6 +47,7 @@ class Comparison:
46
47
  def keyness(
47
48
  self,
48
49
  method: KeynessMethod = "log_likelihood",
50
+ formula: LLFormula = "rayson",
49
51
  effect_size: bool = True,
50
52
  dispersion: bool = False,
51
53
  min_count: int = 5,
@@ -64,6 +66,14 @@ class Comparison:
64
66
  sorts by signed Pearson χ². The other modes
65
67
  (``"log_ratio"``, ``"bayes_factor"``, ``"percent_diff"``)
66
68
  require ``effect_size=True`` and sort by that column.
69
+ formula
70
+ Which log-likelihood formulation to use for the G² column.
71
+ ``"rayson"`` (default) is the 2-cell shortcut matching
72
+ Rayson's UCREL LL Wizard; ``"dunning"`` is the full 4-cell
73
+ G² matching NLTK's ``BigramAssocMeasures`` and R's
74
+ ``quanteda::textstat_keyness(measure="lr")``. See
75
+ ``docs/statistical-methods.md`` for the math + when they
76
+ diverge.
67
77
  effect_size
68
78
  If True (default), also compute LogRatio (Hardie),
69
79
  %DIFF (Gabrielatos), and the BIC-approximated Bayes factor.
@@ -131,7 +141,7 @@ class Comparison:
131
141
  # G² is always computed (cheap, the default sort column). χ² is
132
142
  # computed only when requested — same shape, asymptotically
133
143
  # equivalent, no need to pay for both by default.
134
- table = log_likelihood(a_kept, b_kept, n_a, n_b)
144
+ table = log_likelihood(a_kept, b_kept, n_a, n_b, formula=formula)
135
145
  if method == "chi_squared":
136
146
  chi_table = _chi_squared(a_kept, b_kept, n_a, n_b)
137
147
  table["chi_squared"] = chi_table["chi_squared"]
@@ -192,6 +202,7 @@ class Comparison:
192
202
  label_a=_corpus_label(self.a),
193
203
  label_b=_corpus_label(self.b),
194
204
  params={
205
+ "formula": formula,
195
206
  "effect_size": effect_size,
196
207
  "dispersion": dispersion,
197
208
  "min_count": min_count,
@@ -95,7 +95,7 @@ def from_huggingface(
95
95
  loader = _loader
96
96
  if loader is None:
97
97
  try:
98
- from datasets import load_dataset as _hf_load # type: ignore[import-not-found]
98
+ from datasets import load_dataset as _hf_load
99
99
  except ImportError as exc: # pragma: no cover
100
100
  raise ImportError(
101
101
  "from_huggingface requires the `datasets` library. "
@@ -0,0 +1,149 @@
1
+ """Log-likelihood-ratio keyness statistic (Dunning, Rayson formulations).
2
+
3
+ References
4
+ ----------
5
+ Dunning, T. (1993). Accurate methods for the statistics of surprise and
6
+ coincidence. *Computational Linguistics*, 19(1), 61-74.
7
+
8
+ Rayson, P., & Garside, R. (2000). Comparing corpora using frequency
9
+ profiling. In *Proceedings of the Workshop on Comparing Corpora*,
10
+ ACL 2000, pp. 1-6.
11
+
12
+ Notes
13
+ -----
14
+ Two slightly different log-likelihood-ratio statistics circulate in the
15
+ corpus-linguistics literature for the 2-corpus / 2-state contingency
16
+ table:
17
+
18
+ - **Rayson** (``formula="rayson"``, default): the 2-cell shortcut
19
+ ``LL = 2·(O₁·ln(O₁/E₁) + O₂·ln(O₂/E₂))``, summing only the
20
+ *term-present* cells. This is the formulation behind the
21
+ UCREL Lancaster LL Wizard (the standard online reference at
22
+ http://ucrel.lancs.ac.uk/llwizard.html), originally published in
23
+ Rayson & Garside (2000).
24
+
25
+ - **Dunning** (``formula="dunning"``): the full 4-cell G²,
26
+ summing over *all four* cells of the 2×2 contingency table —
27
+ ``LL = 2·Σᵢ₌₁..₄ Oᵢ·ln(Oᵢ/Eᵢ)``. This is the classical
28
+ Dunning (1993) likelihood-ratio statistic and the formulation used
29
+ by NLTK's :class:`BigramAssocMeasures` and R's
30
+ :func:`quanteda::textstat_keyness(measure="lr")`.
31
+
32
+ For the canonical Rayson example (12000/1M vs 10000/1M), the two
33
+ formulations give 182.07 (Rayson) and 184.10 (Dunning). For the
34
+ near-symmetric, low-frequency cases that dominate corpus-linguistics
35
+ practice the two are typically within 1-2 % of each other; for highly
36
+ asymmetric corpora or high-frequency terms they diverge more.
37
+
38
+ The statistic returned by :func:`log_likelihood` is **signed**: positive
39
+ when the term is overused in corpus A relative to corpus B (i.e.
40
+ ``a/N_a > b/N_b``) and negative when overused in B. This is the
41
+ convention CASS / Lancaster tooling has gravitated toward — it carries
42
+ direction information without needing a separate column. The reported
43
+ *p*-value uses the unsigned magnitude as the test statistic; both
44
+ formulations are asymptotically χ²₁-distributed under the null.
45
+ """
46
+
47
+ from __future__ import annotations
48
+
49
+ from typing import Literal
50
+
51
+ import numpy as np
52
+ import pandas as pd
53
+ from scipy.special import xlogy
54
+ from scipy.stats import chi2
55
+
56
+ LLFormula = Literal["rayson", "dunning"]
57
+
58
+
59
+ def log_likelihood(
60
+ counts_a: pd.Series,
61
+ counts_b: pd.Series,
62
+ total_a: int,
63
+ total_b: int,
64
+ *,
65
+ formula: LLFormula = "rayson",
66
+ ) -> pd.DataFrame:
67
+ """Compute the signed log-likelihood-ratio keyness statistic.
68
+
69
+ Two formulations are available; see the module docstring for the
70
+ references and the math.
71
+
72
+ ``counts_a`` and ``counts_b`` are aligned on their union; missing
73
+ terms are imputed as zero. No min-count filtering is applied here —
74
+ that is the caller's responsibility (see
75
+ :meth:`pycorpdiff.Comparison.keyness`).
76
+
77
+ Parameters
78
+ ----------
79
+ counts_a, counts_b
80
+ Term-frequency series. Index entries are terms; values are
81
+ non-negative integer counts.
82
+ total_a, total_b
83
+ Corpus totals (token counts before any min-count filter). Used
84
+ for the expected counts and the sign under both formulae, and
85
+ for the "not-term" cells under ``formula="dunning"``.
86
+ formula
87
+ ``"rayson"`` (default; 2-cell shortcut, matches Rayson's LL
88
+ Wizard) or ``"dunning"`` (full 4-cell G²).
89
+
90
+ Returns
91
+ -------
92
+ pandas.DataFrame
93
+ Indexed by term, with columns ``count_a``, ``count_b``,
94
+ ``expected_a``, ``expected_b``, ``g2`` (signed), ``p_value``.
95
+ """
96
+ if total_a <= 0 or total_b <= 0:
97
+ raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")
98
+ if formula not in ("rayson", "dunning"):
99
+ raise ValueError(
100
+ f"formula must be 'rayson' or 'dunning'; got {formula!r}"
101
+ )
102
+
103
+ terms = counts_a.index.union(counts_b.index)
104
+ a = counts_a.reindex(terms, fill_value=0).astype(np.int64).to_numpy()
105
+ b = counts_b.reindex(terms, fill_value=0).astype(np.int64).to_numpy()
106
+
107
+ obs_sum = a + b
108
+ total = total_a + total_b
109
+ expected_a = total_a * obs_sum / total
110
+ expected_b = total_b * obs_sum / total
111
+
112
+ # Rayson 2-cell shortcut: only the term-present rows.
113
+ # LL = 2 · (O₁·ln(O₁/E₁) + O₂·ln(O₂/E₂))
114
+ contrib = (
115
+ xlogy(a, a) - xlogy(a, expected_a) + xlogy(b, b) - xlogy(b, expected_b)
116
+ )
117
+ if formula == "dunning":
118
+ # Add the term-absent cells for the full 4-cell G².
119
+ c = total_a - a # other tokens in A
120
+ d = total_b - b # other tokens in B
121
+ expected_c = total_a - expected_a
122
+ expected_d = total_b - expected_b
123
+ contrib = contrib + (
124
+ xlogy(c, c) - xlogy(c, expected_c) + xlogy(d, d) - xlogy(d, expected_d)
125
+ )
126
+ unsigned = 2.0 * contrib
127
+ # Mathematically LL >= 0; clip away the tiny negative values that
128
+ # surface from float roundoff when the two corpora have ~identical rates.
129
+ unsigned = np.maximum(unsigned, 0.0)
130
+
131
+ # Sign by direction of overuse: + when A's rate exceeds B's, else -.
132
+ a_rate = a / total_a
133
+ b_rate = b / total_b
134
+ sign = np.where(a_rate >= b_rate, 1.0, -1.0)
135
+ signed = sign * unsigned
136
+
137
+ p_value = chi2.sf(unsigned, df=1)
138
+
139
+ return pd.DataFrame(
140
+ {
141
+ "count_a": a,
142
+ "count_b": b,
143
+ "expected_a": expected_a,
144
+ "expected_b": expected_b,
145
+ "g2": signed,
146
+ "p_value": p_value,
147
+ },
148
+ index=terms,
149
+ )
@@ -1,16 +1,22 @@
1
1
  """Result dataclasses returned by every public analytical verb.
2
2
 
3
- Every Result implements the same informal contract:
3
+ Each Result implements the relevant subset of an informal six-method
4
+ contract:
4
5
 
5
6
  - ``.to_df()`` returns a tidy :class:`pandas.DataFrame`.
6
7
  - ``.plot(**kw)`` returns an :class:`altair.Chart`.
7
- - ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
8
- evidence for one row of the result.
8
+ - ``.to_html(path=None)`` renders the table as HTML.
9
+ - ``.to_json(path=None)`` renders the table as JSON.
9
10
  - ``.summary()`` returns a short human-readable string.
10
-
11
- This contract is intentionally a duck-typing convention rather than an
12
- abstract base class — it keeps Results lightweight and lets them be
13
- constructed from a plain DataFrame without inheritance gymnastics.
11
+ - ``.explain(term, n)`` returns a :class:`ConcordanceResult` with
12
+ KWIC evidence for one row of the result. Defined only on
13
+ comparison-based Results (``KeynessResult``, ``CollocationShiftResult``)
14
+ where "one row of the result" maps to a target term.
15
+
16
+ See ``docs/design.md`` for the per-Result method matrix. This contract
17
+ is a duck-typing convention rather than an abstract base class — it
18
+ keeps Results lightweight and lets them be constructed from a plain
19
+ DataFrame without inheritance gymnastics.
14
20
  """
15
21
 
16
22
  from __future__ import annotations
@@ -103,10 +103,20 @@ def _quanteda_keyness(corpus_df: pd.DataFrame) -> pd.DataFrame:
103
103
  def test_log_likelihood_matches_quanteda_byte_for_byte(
104
104
  fixture_corpus: pcd.Corpus,
105
105
  ) -> None:
106
- """For every term shared with quanteda, our signed G² agrees to 1e-4."""
106
+ """For every term shared with quanteda (using formula='dunning'),
107
+ our signed G² agrees byte-for-byte to ≤ 1e-10.
108
+
109
+ quanteda's ``textstat_keyness(measure="lr")`` uses the full 4-cell
110
+ Dunning G². The Rayson 2-cell shortcut (our default) is a different
111
+ statistic; comparing like-to-like requires passing ``formula="dunning"``.
112
+ """
107
113
  a = fixture_corpus.slice(frame="A")
108
114
  b = fixture_corpus.slice(frame="B")
109
- ours = pcd.compare(a, b).keyness(min_count=1).table.set_index("term")["g2"]
115
+ ours = (
116
+ pcd.compare(a, b)
117
+ .keyness(min_count=1, formula="dunning")
118
+ .table.set_index("term")["g2"]
119
+ )
110
120
 
111
121
  quanteda_df = _quanteda_keyness(fixture_corpus.docs.copy())
112
122
  theirs = pd.Series(
@@ -123,7 +133,8 @@ def test_log_likelihood_matches_quanteda_byte_for_byte(
123
133
  theirs_v = float(theirs[term])
124
134
  # quanteda's textstat_keyness uses signed G² with the same
125
135
  # convention we do: positive when overused in the target
126
- # group. Agreement to 4 decimal places is more than enough.
127
- assert math.isclose(ours_v, theirs_v, abs_tol=1e-4), (
136
+ # group. With matching formulae, the two implementations
137
+ # should agree to floating-point noise.
138
+ assert math.isclose(ours_v, theirs_v, abs_tol=1e-10), (
128
139
  f"{term}: pycorpdiff={ours_v}, quanteda={theirs_v}"
129
140
  )
@@ -1,3 +0,0 @@
1
- """Pandas backend shim — placeholder until backend abstractions are needed."""
2
-
3
- from __future__ import annotations
@@ -1,92 +0,0 @@
1
- """Dunning's G² log-likelihood statistic.
2
-
3
- Reference
4
- ---------
5
- Dunning, T. (1993). Accurate methods for the statistics of surprise and
6
- coincidence. *Computational Linguistics*, 19(1), 61-74.
7
-
8
- Notes
9
- -----
10
- The G² returned by :func:`log_likelihood` is **signed**: positive when the
11
- term is overused in corpus A relative to corpus B (i.e. ``a/N_a > b/N_b``)
12
- and negative when overused in B. This is the convention CASS / Lancaster
13
- tooling has gravitated toward — it carries direction information without
14
- needing a separate column. The reported *p*-value uses ``|G²|`` as the
15
- test statistic; the unsigned form is what's chi-squared distributed.
16
- """
17
-
18
- from __future__ import annotations
19
-
20
- import numpy as np
21
- import pandas as pd
22
- from scipy.special import xlogy
23
- from scipy.stats import chi2
24
-
25
-
26
- def log_likelihood(
27
- counts_a: pd.Series,
28
- counts_b: pd.Series,
29
- total_a: int,
30
- total_b: int,
31
- ) -> pd.DataFrame:
32
- """Compute Dunning G² for every term in the union of input indices.
33
-
34
- ``counts_a`` and ``counts_b`` are aligned on their union; missing
35
- terms are imputed as zero. No min-count filtering is applied here —
36
- that is the caller's responsibility (see
37
- :meth:`pycorpdiff.Comparison.keyness`).
38
-
39
- Parameters
40
- ----------
41
- counts_a, counts_b
42
- Term-frequency series. Index entries are terms; values are
43
- non-negative integer counts.
44
- total_a, total_b
45
- Corpus totals (token counts before any min-count filter). Used
46
- for the contingency-table "not-term" cells.
47
-
48
- Returns
49
- -------
50
- pandas.DataFrame
51
- Indexed by term, with columns ``count_a``, ``count_b``,
52
- ``expected_a``, ``expected_b``, ``g2`` (signed), ``p_value``.
53
- """
54
- if total_a <= 0 or total_b <= 0:
55
- raise ValueError(f"total_a and total_b must be positive; got {total_a}, {total_b}")
56
-
57
- terms = counts_a.index.union(counts_b.index)
58
- a = counts_a.reindex(terms, fill_value=0).astype(np.int64).to_numpy()
59
- b = counts_b.reindex(terms, fill_value=0).astype(np.int64).to_numpy()
60
-
61
- obs_sum = a + b
62
- total = total_a + total_b
63
- expected_a = total_a * obs_sum / total
64
- expected_b = total_b * obs_sum / total
65
-
66
- # 2 * sum_i O_i * ln(O_i / E_i), with xlogy giving 0*log(0)=0.
67
- unsigned = 2.0 * (
68
- xlogy(a, a) - xlogy(a, expected_a) + xlogy(b, b) - xlogy(b, expected_b)
69
- )
70
- # Mathematically G² >= 0; clip away the tiny negative values that
71
- # surface from float roundoff when the two corpora have ~identical rates.
72
- unsigned = np.maximum(unsigned, 0.0)
73
-
74
- # Sign by direction of overuse: + when A's rate exceeds B's, else -.
75
- a_rate = a / total_a
76
- b_rate = b / total_b
77
- sign = np.where(a_rate >= b_rate, 1.0, -1.0)
78
- signed = sign * unsigned
79
-
80
- p_value = chi2.sf(unsigned, df=1)
81
-
82
- return pd.DataFrame(
83
- {
84
- "count_a": a,
85
- "count_b": b,
86
- "expected_a": expected_a,
87
- "expected_b": expected_b,
88
- "g2": signed,
89
- "p_value": p_value,
90
- },
91
- index=terms,
92
- )
File without changes
File without changes