pycorpdiff 0.1.0a5__tar.gz → 0.1.0a7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (132) hide show
  1. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/.gitignore +0 -3
  2. pycorpdiff-0.1.0a7/CHANGELOG.md +71 -0
  3. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/CITATION.cff +4 -2
  4. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/PKG-INFO +42 -24
  5. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/README.md +39 -21
  6. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/pyproject.toml +15 -9
  7. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/__init__.py +6 -5
  8. pycorpdiff-0.1.0a7/src/pycorpdiff/_backends/pandas.py +9 -0
  9. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/compare.py +15 -2
  10. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/corpus.py +9 -0
  11. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/duckdb.py +13 -1
  12. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/huggingface.py +1 -1
  13. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/bayes.py +10 -2
  14. pycorpdiff-0.1.0a7/src/pycorpdiff/keyness/loglikelihood.py +149 -0
  15. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/results.py +37 -14
  16. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/shift.py +24 -0
  17. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_histwords.py +29 -15
  18. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_quanteda.py +40 -27
  19. pycorpdiff-0.1.0a7/tests/unit/test_audit_a7_fixes.py +133 -0
  20. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_loglikelihood.py +46 -0
  21. pycorpdiff-0.1.0a5/CHANGELOG.md +0 -44
  22. pycorpdiff-0.1.0a5/src/pycorpdiff/_backends/pandas.py +0 -3
  23. pycorpdiff-0.1.0a5/src/pycorpdiff/keyness/loglikelihood.py +0 -92
  24. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/LICENSE +0 -0
  25. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/_backends/__init__.py +0 -0
  26. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/_backends/polars.py +0 -0
  27. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/__init__.py +0 -0
  28. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/cooccurrence.py +0 -0
  29. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/measures.py +0 -0
  30. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/network.py +0 -0
  31. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/collocation/shift.py +0 -0
  32. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/__init__.py +0 -0
  33. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/_data/hansard_sample.parquet +0 -0
  34. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/_generate_hansard.py +0 -0
  35. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/hansard.py +0 -0
  36. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/datasets/histwords.py +0 -0
  37. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/explain.py +0 -0
  38. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/__init__.py +0 -0
  39. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/io/readers.py +0 -0
  40. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/__init__.py +0 -0
  41. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/chi_squared.py +0 -0
  42. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/correction.py +0 -0
  43. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/dispersion.py +0 -0
  44. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/effect_sizes.py +0 -0
  45. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/multicorpus.py +0 -0
  46. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/keyness/permutation.py +0 -0
  47. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/py.typed +0 -0
  48. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/__init__.py +0 -0
  49. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/alignment.py +0 -0
  50. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/embed.py +0 -0
  51. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/semantic/trajectory.py +0 -0
  52. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/stats.py +0 -0
  53. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/__init__.py +0 -0
  54. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/bocpd.py +0 -0
  55. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/causal_impact.py +0 -0
  56. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/changepoint.py +0 -0
  57. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/forecast.py +0 -0
  58. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/its.py +0 -0
  59. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/temporal/slicing.py +0 -0
  60. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/tokenize.py +0 -0
  61. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/__init__.py +0 -0
  62. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/bocpd.py +0 -0
  63. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/causal_impact.py +0 -0
  64. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/collocation.py +0 -0
  65. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/dispersion.py +0 -0
  66. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/forecast.py +0 -0
  67. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/keyness.py +0 -0
  68. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/network.py +0 -0
  69. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/scattertext.py +0 -0
  70. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/semantic_forecast.py +0 -0
  71. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/src/pycorpdiff/viz/trajectory.py +0 -0
  72. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/__init__.py +0 -0
  73. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/conftest.py +0 -0
  74. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/fixtures/__init__.py +0 -0
  75. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/__init__.py +0 -0
  76. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_collocation_integration.py +0 -0
  77. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_nltk.py +0 -0
  78. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_rayson.py +0 -0
  79. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_crossval_scattertext.py +0 -0
  80. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_explain_integration.py +0 -0
  81. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_keyness_integration.py +0 -0
  82. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_sbert_slow.py +0 -0
  83. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_semantic_integration.py +0 -0
  84. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_stop_words.py +0 -0
  85. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_temporal_stats.py +0 -0
  86. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/integration/test_viz.py +0 -0
  87. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/__init__.py +0 -0
  88. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_collocation_properties.py +0 -0
  89. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_keyness_properties.py +0 -0
  90. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/property/test_temporal_properties.py +0 -0
  91. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/__init__.py +0 -0
  92. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_bayes_factor.py +0 -0
  93. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_bocpd.py +0 -0
  94. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_causal_impact.py +0 -0
  95. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_changepoint.py +0 -0
  96. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_chi_squared.py +0 -0
  97. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_cooccurrence.py +0 -0
  98. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_measures.py +0 -0
  99. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_collocation_shift.py +0 -0
  100. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_comparison_concordance.py +0 -0
  101. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_cooccurrence_network.py +0 -0
  102. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_corpus_hash.py +0 -0
  103. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_corpus_vocab.py +0 -0
  104. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_correction.py +0 -0
  105. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_datasets_hansard.py +0 -0
  106. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_dispersion.py +0 -0
  107. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_dispersion_plot.py +0 -0
  108. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_doc_term_counts_sparse.py +0 -0
  109. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_effect_sizes.py +0 -0
  110. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_embedders.py +0 -0
  111. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_explain.py +0 -0
  112. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_forecast.py +0 -0
  113. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_forecast_semantic_drift.py +0 -0
  114. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_from_huggingface.py +0 -0
  115. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_hansard_fetcher.py +0 -0
  116. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_histwords_loader.py +0 -0
  117. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_its.py +0 -0
  118. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_keyness_multi.py +0 -0
  119. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_ngram_tokenizer.py +0 -0
  120. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_permutation_keyness.py +0 -0
  121. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_polars_interop.py +0 -0
  122. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_procrustes.py +0 -0
  123. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_read_duckdb.py +0 -0
  124. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_read_txt_line_mode.py +0 -0
  125. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_result_exports.py +0 -0
  126. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_scattertext_plot.py +0 -0
  127. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_neighbours.py +0 -0
  128. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_shift.py +0 -0
  129. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_semantic_trajectory.py +0 -0
  130. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_smoke.py +0 -0
  131. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_temporal.py +0 -0
  132. {pycorpdiff-0.1.0a5 → pycorpdiff-0.1.0a7}/tests/unit/test_wilson_ci.py +0 -0
@@ -33,9 +33,6 @@ Thumbs.db
33
33
  # Hypothesis example database (auto-managed)
34
34
  .hypothesis/
35
35
 
36
- # Local tooling
37
- .claude/
38
-
39
36
  # Jupyter checkpoints
40
37
  .ipynb_checkpoints/
41
38
 
@@ -0,0 +1,71 @@
1
+ # Changelog
2
+
3
+ All notable changes to `pycorpdiff` are documented in this file. The format
4
+ follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and this
5
+ project adheres to [Semantic Versioning](https://semver.org/).
6
+
7
+ ## [0.1.0a7] — first public release
8
+
9
+ The first public alpha of `pycorpdiff` — comparative corpus analysis
10
+ for modern Python workflows. Three public verbs (`compare`, `track`,
11
+ `compare.before_after`), nine `Result` dataclasses each implementing
12
+ the relevant subset of `.to_df / .plot / .explain / .summary /
13
+ .to_html / .to_json` (see `docs/design.md` for the per-Result method
14
+ matrix), two `typing.Protocol` extension points (`Tokenizer`,
15
+ `Embedder`), and opt-in extras for visualisation, semantic embedding,
16
+ temporal modelling, polars interop, DuckDB ingestion, 🤗 Datasets,
17
+ and notebook rendering.
18
+
19
+ ### Analytical surface
20
+
21
+ - **Keyness**: signed log-likelihood G² with selectable formula
22
+ (`formula="rayson"` 2-cell shortcut, default; matches the UCREL
23
+ LL Wizard. `formula="dunning"` 4-cell G²; matches NLTK +
24
+ `quanteda::textstat_keyness(measure="lr")` to within floating-point tolerance). Pearson
25
+ χ², Hardie LogRatio, Gabrielatos %DIFF, BIC-approximated Bayes
26
+ factor (also tracks the `formula=` choice), Juilland D / Gries DP
27
+ dispersion flagging, Benjamini–Hochberg correction, stop-word
28
+ filtering, empirical permutation *p*-values, N-way contingency G²
29
+ via `keyness_multi`.
30
+ - **Collocations**: logDice, PMI, t-score, MI³ with Laplace smoothing;
31
+ cross-corpus `collocation_shift`; co-occurrence networks via
32
+ `cooccurrence_network`.
33
+ - **Semantic shift**: averaged contextual embeddings, Procrustes
34
+ alignment, multi-period `semantic_trajectory`, `neighborhood_drift`.
35
+ Embedder output shape is validated to catch silently-broken
36
+ embedders before they produce nonsense.
37
+ - **Temporal**: Wilson-CI trajectories, offline PELT changepoints,
38
+ online Bayesian changepoint detection, segmented-OLS interrupted
39
+ time series, Bayesian structural time-series causal impact,
40
+ state-space exponential-smoothing forecasting.
41
+
42
+ ### Cross-validated
43
+
44
+ The package is checked against standard tools by automated test:
45
+
46
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
47
+ triples (fast tier; runs on every push).
48
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
49
+ on every adjacent bigram (slow tier).
50
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
51
+ US Conventions corpus (slow tier).
52
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
53
+ `formula="dunning"` (slow tier).
54
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
55
+ sanity check on Stanford SNAP COHA decade embeddings; skips
56
+ gracefully when the archive isn't reachable (slow tier).
57
+
58
+ ### Extras
59
+
60
+ `[viz]`, `[semantic]`, `[temporal]`, `[polars]`, `[duckdb]`, `[nlp]`,
61
+ `[huggingface]`, `[notebooks]`, `[all]` are MIT-compatible. A separate
62
+ `[showcase]` extra pulls in `pysofra` (GPL-3.0-or-later) for
63
+ JAMA-style table polish in the showcase notebook — opt in explicitly
64
+ if you accept that licence.
65
+
66
+ ### Infrastructure
67
+
68
+ Hundreds of tests, `ruff` + `mypy --strict` clean across the source
69
+ tree, matrix CI on three Python versions × two operating systems,
70
+ plus a slow-tier CI job exercising the cross-validation receipts
71
+ against NLTK + quanteda on main pushes.
@@ -4,7 +4,7 @@ message: >
4
4
  entry. GitHub renders a "Cite this repository" widget directly from
5
5
  this file.
6
6
  title: "pycorpdiff: Comparative Corpus Analysis for Modern Python Workflows"
7
- version: 0.1.0a5
7
+ version: 0.1.0a7
8
8
  date-released: 2026-05-25
9
9
  authors:
10
10
  - family-names: Turner
@@ -32,7 +32,9 @@ abstract: >
32
32
  API. The package targets corpus linguistics, digital humanities,
33
33
  computational social science, and discourse analysis research,
34
34
  emphasising interpretability, explainability, statistical rigour,
35
- and reproducibility.
35
+ and reproducibility. A bundled synthetic UK-Hansard-style sample
36
+ ships for offline demonstration; real-data interfaces include
37
+ fetch_hansard and from_huggingface.
36
38
  identifiers:
37
39
  - type: url
38
40
  value: "https://github.com/jturner-uofl/pycorpdiff"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: pycorpdiff
3
- Version: 0.1.0a5
3
+ Version: 0.1.0a7
4
4
  Summary: Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference.
5
5
  Project-URL: Homepage, https://github.com/jturner-uofl/pycorpdiff
6
6
  Project-URL: Documentation, https://github.com/jturner-uofl/pycorpdiff
@@ -54,7 +54,6 @@ Requires-Dist: matplotlib>=3.8; extra == 'all'
54
54
  Requires-Dist: networkx>=3.1; extra == 'all'
55
55
  Requires-Dist: polars>=1.0; extra == 'all'
56
56
  Requires-Dist: pyarrow>=15; extra == 'all'
57
- Requires-Dist: pysofra>=0.1.0a3; extra == 'all'
58
57
  Requires-Dist: ruptures>=1.1; extra == 'all'
59
58
  Requires-Dist: scikit-learn>=1.3; extra == 'all'
60
59
  Requires-Dist: sentence-transformers>=2.2; extra == 'all'
@@ -77,7 +76,6 @@ Provides-Extra: nlp
77
76
  Requires-Dist: spacy>=3.7; extra == 'nlp'
78
77
  Provides-Extra: notebooks
79
78
  Requires-Dist: jupyter>=1.0; extra == 'notebooks'
80
- Requires-Dist: pysofra>=0.1.0a3; extra == 'notebooks'
81
79
  Requires-Dist: vl-convert-python>=1.5; extra == 'notebooks'
82
80
  Provides-Extra: polars
83
81
  Requires-Dist: polars>=1.0; extra == 'polars'
@@ -85,6 +83,8 @@ Requires-Dist: pyarrow>=15; extra == 'polars'
85
83
  Provides-Extra: semantic
86
84
  Requires-Dist: scikit-learn>=1.3; extra == 'semantic'
87
85
  Requires-Dist: sentence-transformers>=2.2; extra == 'semantic'
86
+ Provides-Extra: showcase
87
+ Requires-Dist: pysofra>=0.1.0a3; extra == 'showcase'
88
88
  Provides-Extra: temporal
89
89
  Requires-Dist: ruptures>=1.1; extra == 'temporal'
90
90
  Requires-Dist: statsmodels>=0.14; extra == 'temporal'
@@ -127,11 +127,11 @@ and computational social science routinely have:
127
127
  `pycorpdiff` is positioned as **orchestration**, not reinvention.
128
128
  Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
129
129
  `SBERT`-compatible model) plug in via two `typing.Protocol` extension
130
- points — one-line adapters, no plugin registry. The base install pulls
131
- only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
132
- via extras.
130
+ points — one-line adapters, no plugin registry. The base install's
131
+ direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
132
+ `pyarrow`; everything else is opt-in via extras.
133
133
 
134
- > **Status: alpha (0.1.0a5).** Public API is stable for the features
134
+ > **Status: alpha (0.1.0a7).** Public API is stable for the features
135
135
  > described below; on PyPI as `pip install pycorpdiff`.
136
136
 
137
137
  ## The three-layer architecture
@@ -178,7 +178,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
178
178
 
179
179
  ```python
180
180
  # Compare verbs (returns Result objects; methods exposed vary by Result)
181
- pcd.compare(a, b).keyness()
181
+ pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
182
+ pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
182
183
  pcd.compare(a, b).collocation_shift("immigrant")
183
184
  pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
184
185
  # SBERTEmbedder downloads a sentence-transformers model on first call;
@@ -190,7 +191,7 @@ tr.changepoints() # offline PELT
190
191
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
191
192
  tr.interrupted_time_series(event_date="2016") # segmented OLS
192
193
  tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
193
- tr.forecast(horizon=4) # state-space ETS
194
+ tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
194
195
 
195
196
  # Before / after a known event
196
197
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
@@ -209,17 +210,20 @@ every analytical surface.
209
210
  ## Installation
210
211
 
211
212
  ```bash
212
- pip install pycorpdiff # lexical-comparative core
213
- pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
214
- pip install "pycorpdiff[semantic]" # + sentence-transformers
215
- pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
216
- pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
217
- pip install "pycorpdiff[all]" # everything
213
+ pip install pycorpdiff # lexical-comparative core (MIT)
214
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
215
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
216
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
217
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
218
+ pip install "pycorpdiff[all]" # everything MIT-compatible
219
+ pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
218
220
  ```
219
221
 
220
- The base install keeps a small dependency footprint (`numpy`, `pandas`,
221
- `scipy`, `pyarrow`); optional extras land per analytical layer so you
222
- only pay for what you use.
222
+ The base install's direct runtime dependencies are `numpy`, `pandas`,
223
+ `scipy`, and `pyarrow`; optional extras land per analytical layer so
224
+ you only pay for what you use. `[showcase]` is broken out separately
225
+ because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
226
+ that extra remains MIT-only.
223
227
 
224
228
  To work from source:
225
229
 
@@ -232,13 +236,27 @@ pytest -q
232
236
 
233
237
  ## Cross-validation receipts
234
238
 
235
- The math agrees with the standard tools by automated test:
239
+ The math is checked against standard tools by automated test. The
240
+ fast tier runs on every push (matrix CI); the slow tier needs heavy
241
+ optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
242
+ downloads) and runs on main pushes only.
236
243
 
237
- - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
238
- - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
239
- - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
240
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
241
- - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
244
+ Fast tier:
245
+
246
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
247
+ triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
248
+
249
+ Slow tier:
250
+
251
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
252
+ on every adjacent bigram
253
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
254
+ US Conventions corpus
255
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
256
+ `formula="dunning"`
257
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
258
+ sanity check on Stanford SNAP COHA decade embeddings (skips
259
+ gracefully if the archive isn't reachable)
242
260
 
243
261
  ## Citation
244
262
 
@@ -31,11 +31,11 @@ and computational social science routinely have:
31
31
  `pycorpdiff` is positioned as **orchestration**, not reinvention.
32
32
  Tokenizers (`spaCy`, `Stanza`, `jieba`, `fugashi`) and embedders (any
33
33
  `SBERT`-compatible model) plug in via two `typing.Protocol` extension
34
- points — one-line adapters, no plugin registry. The base install pulls
35
- only `numpy`, `pandas`, `scipy`, and `pyarrow`; everything else is opt-in
36
- via extras.
34
+ points — one-line adapters, no plugin registry. The base install's
35
+ direct runtime dependencies are `numpy`, `pandas`, `scipy`, and
36
+ `pyarrow`; everything else is opt-in via extras.
37
37
 
38
- > **Status: alpha (0.1.0a5).** Public API is stable for the features
38
+ > **Status: alpha (0.1.0a7).** Public API is stable for the features
39
39
  > described below; on PyPI as `pip install pycorpdiff`.
40
40
 
41
41
  ## The three-layer architecture
@@ -82,7 +82,8 @@ for the full feature tour, or the cheat sheet below for one-line API previews.
82
82
 
83
83
  ```python
84
84
  # Compare verbs (returns Result objects; methods exposed vary by Result)
85
- pcd.compare(a, b).keyness()
85
+ pcd.compare(a, b).keyness() # default formula="rayson" (LL Wizard)
86
+ pcd.compare(a, b).keyness(formula="dunning") # full 4-cell G² (matches quanteda / NLTK)
86
87
  pcd.compare(a, b).collocation_shift("immigrant")
87
88
  pcd.compare(a, b).semantic_shift("immigrant", embedder=pcd.SBERTEmbedder()) # [semantic]
88
89
  # SBERTEmbedder downloads a sentence-transformers model on first call;
@@ -94,7 +95,7 @@ tr.changepoints() # offline PELT
94
95
  tr.changepoints_online(hazard=1/24) # Bayesian online (Adams & MacKay 2007)
95
96
  tr.interrupted_time_series(event_date="2016") # segmented OLS
96
97
  tr.causal_impact(event_date="2016") # Bayesian counterfactual (Brodersen 2015)
97
- tr.forecast(horizon=4) # state-space ETS
98
+ tr.forecast(horizon=4) # 4 periods at the over_time freq (state-space ETS)
98
99
 
99
100
  # Before / after a known event
100
101
  pcd.compare.before_after(corpus, event_date="2016-06-23").keyness()
@@ -113,17 +114,20 @@ every analytical surface.
113
114
  ## Installation
114
115
 
115
116
  ```bash
116
- pip install pycorpdiff # lexical-comparative core
117
- pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
118
- pip install "pycorpdiff[semantic]" # + sentence-transformers
119
- pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
120
- pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert / pysofra
121
- pip install "pycorpdiff[all]" # everything
117
+ pip install pycorpdiff # lexical-comparative core (MIT)
118
+ pip install "pycorpdiff[viz]" # + altair / matplotlib / networkx
119
+ pip install "pycorpdiff[semantic]" # + sentence-transformers
120
+ pip install "pycorpdiff[temporal]" # + ruptures / statsmodels
121
+ pip install "pycorpdiff[notebooks]" # + jupyter / vl-convert
122
+ pip install "pycorpdiff[all]" # everything MIT-compatible
123
+ pip install "pycorpdiff[all,showcase]" # + pysofra (GPL-3.0-or-later) for the JAMA-style showcase
122
124
  ```
123
125
 
124
- The base install keeps a small dependency footprint (`numpy`, `pandas`,
125
- `scipy`, `pyarrow`); optional extras land per analytical layer so you
126
- only pay for what you use.
126
+ The base install's direct runtime dependencies are `numpy`, `pandas`,
127
+ `scipy`, and `pyarrow`; optional extras land per analytical layer so
128
+ you only pay for what you use. `[showcase]` is broken out separately
129
+ because `pysofra` is GPL-3.0-or-later — pure `pycorpdiff` use without
130
+ that extra remains MIT-only.
127
131
 
128
132
  To work from source:
129
133
 
@@ -136,13 +140,27 @@ pytest -q
136
140
 
137
141
  ## Cross-validation receipts
138
142
 
139
- The math agrees with the standard tools by automated test:
143
+ The math is checked against standard tools by automated test. The
144
+ fast tier runs on every push (matrix CI); the slow tier needs heavy
145
+ optional dependencies (R + quanteda, NLTK, rpy2, Stanford SNAP
146
+ downloads) and runs on main pushes only.
140
147
 
141
- - **Rayson's LL Wizard** — hand-derived contingency-table reference triples
142
- - **NLTK** `BigramAssocMeasures` — PMI + t-score to ≤ 1e-12 on every adjacent bigram
143
- - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012 US Conventions corpus
144
- - **quanteda (R)** via `rpy2` — byte-for-byte G² agreement (slow tier)
145
- - **HistWords (Hamilton et al. 2016)** — diachronic cosine displacements on COHA (slow tier)
148
+ Fast tier:
149
+
150
+ - **Rayson's LL Wizard** — hand-derived contingency-table reference
151
+ triples ([`tests/integration/test_crossval_rayson.py`](https://github.com/jturner-uofl/pycorpdiff/blob/main/tests/integration/test_crossval_rayson.py))
152
+
153
+ Slow tier:
154
+
155
+ - **NLTK** `BigramAssocMeasures` — PMI + t-score agreement to ≤ 1e-12
156
+ on every adjacent bigram
157
+ - **Scattertext (Kessler 2017)** — behavioural agreement on the 2012
158
+ US Conventions corpus
159
+ - **quanteda (R)** via `rpy2` — G² agreement to ≤ 1e-10 with
160
+ `formula="dunning"`
161
+ - **HistWords (Hamilton et al. 2016)** — known-shifter / stable-word
162
+ sanity check on Stanford SNAP COHA decade embeddings (skips
163
+ gracefully if the archive isn't reachable)
146
164
 
147
165
  ## Citation
148
166
 
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "pycorpdiff"
7
- version = "0.1.0a5"
7
+ version = "0.1.0a7"
8
8
  description = "Comparative corpus analysis for Python: keyness, collocations, semantic shift, temporal trajectories with changepoints + causal inference."
9
9
  readme = "README.md"
10
10
  license = { file = "LICENSE" }
@@ -62,13 +62,18 @@ nlp = ["spacy>=3.7"]
62
62
  # Public-text-corpus hub. Heavy (pulls pyarrow, fsspec, requests, aiohttp),
63
63
  # so opt-in only — base install stays small.
64
64
  huggingface = ["datasets>=2.14"]
65
- # Needed if you want to execute the showcase notebook or regenerate the
66
- # rendered HTML examples. `jupyter` runs the notebook, `vl-convert` does
67
- # static SVG/PNG export of altair charts, `pysofra` renders the showcase's
68
- # result tables in JAMA-style typography.
69
- notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5", "pysofra>=0.1.0a3"]
70
- # Meta-extra: `pycorpdiff[all]` pulls in every optional code path
71
- # including the notebook runtime.
65
+ # Needed if you want to execute the example notebooks. `jupyter` runs
66
+ # the notebook; `vl-convert` does static SVG/PNG export of altair charts.
67
+ # Kept MIT-clean — see `showcase` below for the JAMA-style table polish.
68
+ notebooks = ["jupyter>=1.0", "vl-convert-python>=1.5"]
69
+ # Adds `pysofra` for the showcase notebook's JAMA-style typography.
70
+ # IMPORTANT: pysofra is GPL-3.0-or-later. Opting in to `[showcase]` (or
71
+ # installing pysofra directly) brings GPL into your environment; pure
72
+ # pycorpdiff use without this extra remains MIT-only.
73
+ showcase = ["pysofra>=0.1.0a3"]
74
+ # Meta-extra: every MIT-compatible optional code path. Does NOT include
75
+ # `[showcase]` because pysofra is GPL-3.0-or-later; install
76
+ # `pycorpdiff[all,showcase]` explicitly if you accept that licence.
72
77
  all = [
73
78
  "altair>=5",
74
79
  "matplotlib>=3.8",
@@ -84,7 +89,6 @@ all = [
84
89
  "spacy>=3.7",
85
90
  "jupyter>=1.0",
86
91
  "vl-convert-python>=1.5",
87
- "pysofra>=0.1.0a3",
88
92
  ]
89
93
  dev = [
90
94
  "pytest>=8",
@@ -176,6 +180,8 @@ disallow_any_generics = true
176
180
  module = [
177
181
  "altair",
178
182
  "altair.*",
183
+ "datasets",
184
+ "datasets.*",
179
185
  "duckdb",
180
186
  "duckdb.*",
181
187
  "matplotlib",
@@ -6,20 +6,21 @@ result objects (:class:`KeynessResult`, :class:`CollocationShiftResult`,
6
6
  :class:`SemanticShiftResult`, :class:`TemporalTrajectory`,
7
7
  :class:`NetworkResult`, :class:`ForecastResult`,
8
8
  :class:`CausalImpactResult`, :class:`BocpdResult`,
9
- :class:`ConcordanceResult`), each implementing the same
10
- ``.to_df / .plot / .explain / .summary / .to_html / .to_json`` contract.
9
+ :class:`ConcordanceResult`), each implementing the relevant subset of
10
+ the ``.to_df / .plot / .explain / .summary / .to_html / .to_json``
11
+ contract. See ``docs/design.md`` for the per-Result method matrix.
11
12
 
12
13
  Example
13
14
  -------
14
15
 
15
16
  >>> import pycorpdiff as pcd
16
- >>> pcd.__version__
17
- '0.1.0a5'
17
+ >>> isinstance(pcd.__version__, str)
18
+ True
18
19
  """
19
20
 
20
21
  from __future__ import annotations
21
22
 
22
- __version__ = "0.1.0a5"
23
+ __version__ = "0.1.0a7"
23
24
 
24
25
  from .collocation.network import NetworkResult, cooccurrence_network
25
26
  from .compare import Comparison, compare
@@ -0,0 +1,9 @@
1
+ """Pandas-backed internals for :class:`pycorpdiff.Corpus`.
2
+
3
+ Corpus operations route through this module so backend-specific code
4
+ stays out of the public API. The pandas backend is the default and is
5
+ exercised on every install; polars is opt-in via the ``polars`` extra
6
+ and lives in the sibling :mod:`pycorpdiff._backends.polars`.
7
+ """
8
+
9
+ from __future__ import annotations
@@ -10,6 +10,7 @@ from dataclasses import dataclass
10
10
  from typing import TYPE_CHECKING, Literal
11
11
 
12
12
  from .corpus import Corpus, CorpusSlice
13
+ from .keyness.loglikelihood import LLFormula
13
14
 
14
15
  if TYPE_CHECKING:
15
16
  from .results import (
@@ -46,6 +47,7 @@ class Comparison:
46
47
  def keyness(
47
48
  self,
48
49
  method: KeynessMethod = "log_likelihood",
50
+ formula: LLFormula = "rayson",
49
51
  effect_size: bool = True,
50
52
  dispersion: bool = False,
51
53
  min_count: int = 5,
@@ -64,6 +66,14 @@ class Comparison:
64
66
  sorts by signed Pearson χ². The other modes
65
67
  (``"log_ratio"``, ``"bayes_factor"``, ``"percent_diff"``)
66
68
  require ``effect_size=True`` and sort by that column.
69
+ formula
70
+ Which log-likelihood formulation to use for the G² column.
71
+ ``"rayson"`` (default) is the 2-cell shortcut matching
72
+ Rayson's UCREL LL Wizard; ``"dunning"`` is the full 4-cell
73
+ G² matching NLTK's ``BigramAssocMeasures`` and R's
74
+ ``quanteda::textstat_keyness(measure="lr")``. See
75
+ ``docs/statistical-methods.md`` for the math and for when they
76
+ diverge.
67
77
  effect_size
68
78
  If True (default), also compute LogRatio (Hardie),
69
79
  %DIFF (Gabrielatos), and the BIC-approximated Bayes factor.
@@ -131,7 +141,7 @@ class Comparison:
131
141
  # G² is always computed (cheap, the default sort column). χ² is
132
142
  # computed only when requested — same shape, asymptotically
133
143
  # equivalent, no need to pay for both by default.
134
- table = log_likelihood(a_kept, b_kept, n_a, n_b)
144
+ table = log_likelihood(a_kept, b_kept, n_a, n_b, formula=formula)
135
145
  if method == "chi_squared":
136
146
  chi_table = _chi_squared(a_kept, b_kept, n_a, n_b)
137
147
  table["chi_squared"] = chi_table["chi_squared"]
@@ -139,7 +149,9 @@ class Comparison:
139
149
  if effect_size:
140
150
  table["log_ratio"] = _log_ratio(a_kept, b_kept, n_a, n_b)
141
151
  table["percent_diff"] = _percent_diff(a_kept, b_kept, n_a, n_b)
142
- table["bayes_factor"] = _bayes_factor(a_kept, b_kept, n_a, n_b)
152
+ table["bayes_factor"] = _bayes_factor(
153
+ a_kept, b_kept, n_a, n_b, formula=formula
154
+ )
143
155
 
144
156
  if dispersion:
145
157
  kept_terms = table.index
@@ -192,6 +204,7 @@ class Comparison:
192
204
  label_a=_corpus_label(self.a),
193
205
  label_b=_corpus_label(self.b),
194
206
  params={
207
+ "formula": formula,
195
208
  "effect_size": effect_size,
196
209
  "dispersion": dispersion,
197
210
  "min_count": min_count,
@@ -242,6 +242,15 @@ class Corpus:
242
242
  """
243
243
  from .temporal.slicing import TemporalCorpus # local import to break cycle
244
244
 
245
+ if len(self.docs) == 0:
246
+ raise ValueError(
247
+ "by_time() requires a non-empty corpus; got 0 documents."
248
+ )
249
+ if col not in self.docs.columns:
250
+ raise ValueError(
251
+ f"by_time(col={col!r}, ...): column not found in corpus. "
252
+ f"Available columns: {list(self.docs.columns)!r}."
253
+ )
245
254
  return TemporalCorpus(parent=self, time_col=col, freq=freq)
246
255
 
247
256
  def with_tokenizer(self, tokenizer: Tokenizer) -> Corpus:
@@ -71,12 +71,24 @@ def read_duckdb(
71
71
  ... )
72
72
  """
73
73
  try:
74
- import duckdb # noqa: F401
74
+ import duckdb
75
75
  except ImportError as exc: # pragma: no cover
76
76
  raise ImportError(
77
77
  "read_duckdb requires duckdb. Install with: pip install 'pycorpdiff[duckdb]'"
78
78
  ) from exc
79
79
 
80
+ if isinstance(connection, str):
81
+ raise TypeError(
82
+ "read_duckdb expects a DuckDB connection, not a file path. "
83
+ f"Got connection={connection!r}. Open one first: "
84
+ f'duckdb.connect({connection!r}), or pcd.read_duckdb(duckdb.connect(), "...")'
85
+ )
86
+ if not isinstance(connection, duckdb.DuckDBPyConnection):
87
+ raise TypeError(
88
+ "read_duckdb expects a duckdb.DuckDBPyConnection; got "
89
+ f"{type(connection).__name__}. Open one via duckdb.connect(...)."
90
+ )
91
+
80
92
  cursor = connection.execute(query, params) if params is not None else connection.execute(query)
81
93
  df = cursor.df()
82
94
  if text_col not in df.columns:
@@ -95,7 +95,7 @@ def from_huggingface(
95
95
  loader = _loader
96
96
  if loader is None:
97
97
  try:
98
- from datasets import load_dataset as _hf_load # type: ignore[import-not-found]
98
+ from datasets import load_dataset as _hf_load
99
99
  except ImportError as exc: # pragma: no cover
100
100
  raise ImportError(
101
101
  "from_huggingface requires the `datasets` library. "
@@ -15,7 +15,7 @@ from __future__ import annotations
15
15
  import numpy as np
16
16
  import pandas as pd
17
17
 
18
- from .loglikelihood import log_likelihood
18
+ from .loglikelihood import LLFormula, log_likelihood
19
19
 
20
20
 
21
21
  def bayes_factor(
@@ -23,6 +23,8 @@ def bayes_factor(
23
23
  counts_b: pd.Series,
24
24
  total_a: int,
25
25
  total_b: int,
26
+ *,
27
+ formula: LLFormula = "rayson",
26
28
  ) -> pd.Series:
27
29
  """BIC-approximated Bayes factor for each term's frequency difference.
28
30
 
@@ -31,6 +33,12 @@ def bayes_factor(
31
33
  the unsigned log-likelihood. The Bayes factor is then
32
34
  ``exp(BIC / 2)``. Wilson (2013) is the keyness application.
33
35
 
36
+ ``formula`` selects which G² flavour feeds the BF: ``"rayson"`` (the
37
+ 2-cell shortcut, default; matches the LL Wizard) or ``"dunning"``
38
+ (the full 4-cell G²; matches quanteda/NLTK). Use the same
39
+ ``formula=`` as the ``keyness()`` call that produced the row so the
40
+ G² and the Bayes factor in a single row describe the same statistic.
41
+
34
42
  Interpret with Kass & Raftery (1995):
35
43
 
36
44
  - ``BF > 2`` : positive evidence
@@ -43,7 +51,7 @@ def bayes_factor(
43
51
  plots / sorts handle it.
44
52
  """
45
53
  terms = counts_a.index.union(counts_b.index)
46
- ll_table = log_likelihood(counts_a, counts_b, total_a, total_b)
54
+ ll_table = log_likelihood(counts_a, counts_b, total_a, total_b, formula=formula)
47
55
  g2_abs = ll_table["g2"].abs()
48
56
  bic = g2_abs - np.log(total_a + total_b)
49
57
  with np.errstate(over="ignore"):